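#!/usr/bin/env python3
"""Analyse the scraped posts of Matteo Renzi and Matteo Salvini
(matteorenziufficiale.json, salviniofficial.json; one JSON post per line).

For each year and for the whole timeline, emit word lists and counters,
emoji counters, posting-hour ("sleep") histograms, heatmaps of mentioned
places, average post lengths, and daily posting trends as TSV for plotting.
"""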
import json
import os
import calendar
from collections import Counter

# Interactive debugging hook (drops into an IPython shell when invoked).
from IPython import embed

# Create the output directories, ignoring those that already exist.
for yr in range(2013, 2020):
    os.makedirs('avg_' + str(yr), exist_ok=True)
    os.makedirs(str(yr), exist_ok=True)
os.makedirs('all', exist_ok=True)
os.makedirs('trends', exist_ok=True)


def to_strtime(time, includeYear):
    """Format a date/datetime as 'D-Mon' or, with includeYear, 'D-Mon-YYYY'."""
    if includeYear:
        return f'{time.day}-{calendar.month_name[time.month][:3]}-{time.year}'
    return f'{time.day}-{calendar.month_name[time.month][:3]}'
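# Illustrative (not executed): for a timestamp t on 3 May 2019,
#   to_strtime(t, True)  -> '3-May-2019'
#   to_strtime(t, False) -> '3-May'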


def compute_trend(p1, y1, p2, y2):
    """Write a TSV of daily post counts for two (politician, year) pairs.

    y1/y2 are either ints (compare two specific years) or both 'all'
    (compare the whole timelines on a dummy-year axis; see jday below).
    """
    from datetime import date, timedelta

    max_date = date(2019, 11, 24)  # last day covered by the scraped data
    if y1 == 'all':
        assert y2 == 'all'
        # Renzi's page only has data from 2018 onwards.
        start = date(2018, 1, 1) if 'renzi' in (p1, p2) else date(2013, 1, 1)
        end = max_date
    else:
        mmin = min((y1, y2))
        mmax = max((y1, y2))
        start = date(mmin, 1, 1)
        end = date(mmax, 12, 31)
        if end > max_date:
            end = max_date
    delta = end - start
    assert delta.days > 0, embed()  # drop into IPython on an empty range

    filename = f'trends/{y1}_{p1}_{y2}_{p2}_trends.tsv'
    with open(str(y1)+'/'+p1+'_trend.json', 'r') as f:
        trend1 = json.loads(f.read())
    with open(str(y2)+'/'+p2+'_trend.json', 'r') as f:
        trend2 = json.loads(f.read())

    with open(filename, 'w') as f:
        f.write(f'date\t{p1.capitalize()}-{str(y1)[2:]}\t{p2.capitalize()}-{str(y2)[2:]}\n')
        cnt = 0
        for i in range(365):  # exactly 365 daily rows starting from `start`
            cnt += 1
            day = start + timedelta(days=i)
            k = to_strtime(day, False)
            v1 = trend1.get(k, 0)
            v2 = trend2.get(k, 0)
            mth = calendar.month_name[day.month][:3]
            # Dummy year ('70) so yearless dates still parse when plotted.
            jday = f'70-{mth}-{day.day}' if y1 == 'all' else k  # with year: to_strtime(day, y1 != 'all')
            f.write(f'{jday}\t{v1}\t{v2}\n')
    print('wrote:', cnt, 'trends')
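# Illustrative usage (assumes the *_trend.json files were already written
# by write_words below):
#   compute_trend('renzi', 2018, 'salvini', 2019)
# reads 2018/renzi_trend.json and 2019/salvini_trend.json and produces
# trends/2018_renzi_2019_salvini_trends.tsv, one row per day.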


def compute_possible_trends():
    from itertools import repeat, combinations
    # All (politician, year) pairs with data: Renzi 2018-2019, Salvini 2013-2019.
    r = zip(repeat('renzi'), (2018, 2019))
    s = zip(repeat('salvini'), range(2013, 2020))
    t = tuple(r) + tuple(s)
    poss = list(combinations(t, 2))
    poss.append((('renzi', 'all'), ('salvini', 'all')))
    try:
        for p1, p2 in poss:
            compute_trend(*p1, *p2)
    except Exception:
        embed()  # inspect the failure interactively, then abort
        import sys
        sys.exit()
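# combinations(t, 2) over the 9 (politician, year) pairs gives C(9, 2) = 36
# comparisons; the final ('all', 'all') pair brings the total to 37 TSV files.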


def parse_politico(filename):
    """Load a line-delimited JSON dump of posts and bucket them by year."""
    from dateutil.parser import parse as dateparse

    politico = dict()
    with open(filename, 'r') as f:
        content = [json.loads(l) for l in f.readlines()]

    posts = []  # renamed from `all`, which shadowed the builtin
    for j in content:  # parse dates and drop unused keys
        j['time'] = dateparse(j['time'])
        for k in ['text', 'shared_text', 'link', 'post_url', 'shares', 'comments', 'likes', 'image', 'post_id']:
            j.pop(k)
        posts.append(j)

    politico['all'] = posts

    # Per-year posts and average post length per year.
    years = {p['time'].year for p in posts}
    for yr in years:
        yposts = [j for j in posts if j['time'].year == yr]
        politico[yr] = yposts
        total = sum(len(p['post_text']) for p in yposts)
        politico['avg_' + str(yr)] = int(total / len(yposts))

    print('Parsed', filename)
    return politico
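# The returned dict maps 'all' and each year to post lists, plus 'avg_<year>'
# to an int, e.g. {'all': [...], 2018: [...], 'avg_2018': 142} (values illustrative).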


def calcolo_comuni(words):
    """Map words naming an Italian comune (or a few European capitals of
    interest) to their coordinate records, for the heatmap."""
    with open('comuni.json', 'r') as f:
        comuni_content = f.read()

    with open('world_cities.json', 'r') as f:
        world_content = f.read()

    # Load the Italian comuni, keyed by uppercase name.
    comuniCoord = dict()
    for j in json.loads(comuni_content):
        comuniCoord[j['name'].upper()] = j

    mondo = dict()
    for j in json.loads(world_content):
        name = j['name'].upper()
        cc = j['country_code']
        if cc not in ['CA', 'US']:  # skip Canada/US entries
            j['lat'], j['lng'] = j['latitude'], j['longitude']
            mondo[name] = j

    # Italian names of a few European capitals of interest.
    interesse = {'PARIGI': 'PARIS', 'MOSCA': 'MOSCOW', 'BERLINO': 'BERLIN', 'LONDRA': 'LONDON', 'BRUXELLES': 'BRUSSELS'}
    europa = {key: mondo[value] for key, value in interesse.items()}
    comuniCoord.update(europa)

    comuni = set(comuniCoord.keys())
    # Drop comune names that collide with common Italian words or surnames
    # ("AMATO", "MONTI", "ORA", ...) to avoid false positives.
    comuni -= {
        "ALFANO", "PAESE", "FONDO", "VITA", "CARDINALE", "PARENTI",
        "AMATO", "BELLA", "LIBERI", "BOMBA", "POPOLI", "MENTANA",
        "MONTI", "CALCI", "ORA", "DON", "PREZZO", "CALCIO",
        "MACELLO", "RUSSI", "PORTE",
    }

    visitati = filter(lambda w: w.upper() in comuni, words)
    return {'locations': [comuniCoord[v.upper()] for v in visitati]}
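# Illustrative (assuming 'MILANO' appears in comuni.json):
#   calcolo_comuni(['oggi', 'milano', 'parigi'])
#   -> {'locations': [<MILANO record>, <PARIS record>]}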


def write_words(politico, prefix):
    """For each key of `politico`, write the per-year (or 'all') outputs:
    word list, word counter, emoji counter, daily trend, posting-hour
    ("sleep") histogram, places heatmap HTML, and average post length.
    `prefix` is the filename prefix ('renzi' or 'salvini')."""
    import string
    import emoji

    punctuations = string.punctuation + '“’…'
    with open('italian_stopwords', 'r') as f:
        # Hand-picked extra stopwords: small numbers, truncated words, etc.
        personal = [str(i) for i in range(50)] + ['tre', 'ps', 'fra', 'va', 'ce', 'due', 'co', 'qui', 'di', 'far', 'sa', 'c’è', 'quattro', 'cinque', 'sei', 'sette', 'otto', 'nove', 'dieci', 'post', 'd', 'p', 's', 'de', 'ly']
        personal.remove('18')  # deliberately keep '18' as a token
        stopwords = set([w.strip() for w in f.readlines()] + personal)

    for k, v in politico.items():
        # politico keys:
        #   'all'        -> every post
        #   <year>       -> posts for that year
        #   'avg_<year>' -> average post length for that year
        words = []
        emojis = []
        sonno = dict()
        # Pre-build the month -> hour-of-day layout, all counts at zero.
        # (month_name[0] is the empty string, hence the [1:].)
        for month in calendar.month_name[1:]:
            sonno[month[:3]] = dict()
            for i in range(24):
                sonno[month[:3]][i] = 0

        filename_words = f'{k}/{prefix}_words'
        filename_counter = f'{k}/{prefix}_counter'
        filename_sleep = f'{k}/{prefix}_sleep'
        filename_emoji = f'{k}/{prefix}_emoji'
        filename_comuni = f'{k}/{prefix}_comuni.html'
        filename_trend = f'{k}/{prefix}_trend.json'
        filename_avg = f'{k}/{prefix}'

        # Write the average post length and move on.
        if str(k).startswith('avg'):
            with open(filename_avg, 'w') as f:
                f.write(str(v))
            print('Wrote', filename_avg)
            continue

        def filter_word(w):
            return w not in stopwords and not w.isdigit() and w != ''

        assert k in list(range(2013, 2020)) + ['all'], k

        trends = list()  # one entry per post: the day it was published
        for j in v:
            text = j['post_text'].replace('\n', ' ').lower()
            for punct in punctuations:
                text = text.replace(punct, ' ')
            words.extend(filter(filter_word, text.split(' ')))
            # emoji.UNICODE_EMOJI is the flat dict of old emoji releases
            # (restructured in emoji 1.0, removed in 2.0).
            emojis.extend(filter(lambda w: w in emoji.UNICODE_EMOJI, text.split(' ')))

            time = j['time']
            sonno[calendar.month_name[time.month][:3]][time.hour] += 1
            trends.append(to_strtime(time, k == 'all'))
        print('Computed word, sleep and emoji counts')

        with open(filename_words, 'w') as f:
            f.writelines([w + '\n' for w in words])
        print('Wrote', filename_words)

        wcounter = Counter(words)
        with open(filename_counter, 'w') as f:
            f.write(json.dumps(wcounter))
        print('Wrote', filename_counter)

        ecounter = Counter(emojis)
        with open(filename_emoji, 'w') as f:
            f.write(json.dumps(ecounter))
        print('Wrote', filename_emoji)

        with open(filename_trend, 'w') as f:
            f.write(json.dumps(Counter(trends)))
        print('Wrote', filename_trend)

        with open(filename_sleep, 'w') as f:
            f.write(json.dumps(sonno))
        print('Wrote', filename_sleep)

        print('Computing comuni (visited places)')
        comuni_visitati = calcolo_comuni(words)
        from make_heatmap import Generator as HeatMapGen
        generator = HeatMapGen()
        assert isinstance(comuni_visitati, dict)
        generator.run(comuni_visitati, filename_comuni)
        print()
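# After a full run the output tree looks like (illustrative):
#   2018/renzi_words, 2018/renzi_counter, 2018/renzi_sleep, 2018/renzi_emoji,
#   2018/renzi_trend.json, 2018/renzi_comuni.html, avg_2018/renzi,
#   all/salvini_words, ..., trends/2018_renzi_2019_salvini_trends.tsv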


if __name__ == '__main__':
    renzi = parse_politico('matteorenziufficiale.json')
    write_words(renzi, 'renzi')
    salvini = parse_politico('salviniofficial.json')
    write_words(salvini, 'salvini')

    compute_possible_trends()