"""Compute per-politician post statistics from JSON-lines post dumps.

For every politician the script writes, per year and for the whole data set
('all'): the list of words, word and emoji counters, an hour-of-day activity
table, a posts-per-day counter and a heat map of the place names mentioned.
It then writes pairwise posts-per-day comparison tables under ``trends/``.
"""
import calendar
import json
import os
import sys
import traceback
from collections import Counter, defaultdict
from datetime import date, timedelta
from itertools import combinations

try:
    # Interactive shell used only on failure paths; the script itself does
    # not need IPython to run.
    from IPython import embed as fuck
except ImportError:
    fuck = None

# Years that get their own output directory.
YEARS = range(2013, 2020)
# Last day considered when validating the date window of a trend table.
MAX_DATE = date(2019, 11, 24)

# Keys dropped from every post right after parsing.
UNUSED_POST_KEYS = ('text', 'shared_text', 'link', 'post_url', 'shares',
                    'comments', 'likes', 'image', 'post_id')

# Italian spelling -> name used in world_cities.json.
EUROPEAN_CAPITALS = {'PARIGI': 'PARIS', 'MOSCA': 'MOSCOW', 'BERLINO': 'BERLIN',
                     'LONDRA': 'LONDON', 'BRUXELLES': 'BRUSSELS'}

# Names from comuni.json that are never reported as locations.
EXCLUDED_COMUNI = frozenset([
    "ALFANO", "PAESE", "FONDO", "VITA", "CARDINALE", "PARENTI", "AMATO",
    "BELLA", "LIBERI", "BOMBA", "POPOLI", "MENTANA", "MONTI", "CALCI", "ORA",
    "DON", "PREZZO", "CALCIO", "MACELLO", "RUSSI", "PORTE",
])

# Extra stopwords added to the ones read from the 'italian_stopwords' file.
# '18' is deliberately left out of the numeric range (the original list
# removed it); purely numeric tokens are dropped by the word filter anyway.
PERSONAL_STOPWORDS = frozenset(
    [str(i) for i in range(50) if i != 18]
    + ['tre', 'ps', '15', '23', 'fra', 'va', 'ce', 'due', 'co', 'qui', 'di',
       'far', 'di', 'sa', 'c’è', 'quattro', 'cinque', 'sei', 'sette', 'otto',
       'nove', 'dieci', 'post', 'd', 'p', 's', 'de', 'ly']
)

# Output directories are created at import time, as before. exist_ok makes
# re-runs a no-op while real failures (e.g. permissions) are no longer hidden.
for _dir in ([f'avg_{yr}' for yr in YEARS]
             + [str(yr) for yr in YEARS]
             + ['all', 'trends']):
    os.makedirs(_dir, exist_ok=True)


def to_strtime(time, includeYear):
    """Format a date as 'D-Mon', or 'D-Mon-YYYY' when *includeYear* is true.

    *time* is any object with ``day``, ``month`` and ``year`` attributes.
    The month is the first three letters of ``calendar.month_name``.
    """
    label = f'{time.day}-{calendar.month_name[time.month][:3]}'
    return f'{label}-{time.year}' if includeYear else label


def _load_trend(year, politician):
    """Read '<year>/<politician>_trend.json' as a {'D-Mon': count} mapping.

    The 'all' file is keyed by 'D-Mon-YYYY' (see ``write_words``), so its
    counts are summed per calendar day to match the 'D-Mon' lookups done by
    ``compute_trend``; without this every 'all' lookup misses.
    """
    with open(f'{year}/{politician}_trend.json', 'r', encoding='utf-8') as f:
        trend = json.load(f)
    if year != 'all':
        return trend
    folded = Counter()
    for key, count in trend.items():
        folded['-'.join(key.split('-')[:2])] += count
    return folded


def compute_trend(p1, y1, p2, y2):
    """Write a TSV comparing posts per day of two (politician, year) series.

    Reads '<y1>/<p1>_trend.json' and '<y2>/<p2>_trend.json' and writes
    'trends/<y1>_<p1>_<y2>_<p2>_trends.tsv' with a header row followed by one
    row per day: date label, count for series 1, count for series 2. Days
    without posts get 0.

    Args:
        p1, p2: politician prefixes used in the file names.
        y1, y2: either two integer years or both the string 'all'.

    Raises:
        ValueError: if only *y1* is 'all', or the resulting date window is
            empty.
    """
    if y1 == 'all':
        if y2 != 'all':
            raise ValueError(f"'all' can only be compared with 'all', got {y2!r}")
        start = date(2018, 1, 1) if 'renzi' in (p1, p2) else date(2013, 1, 1)
        end = MAX_DATE
    else:
        start = date(min(y1, y2), 1, 1)
        end = min(date(max(y1, y2), 12, 31), MAX_DATE)
    if not end > start:
        raise ValueError(f'empty date window: {start} .. {end}')

    trend1 = _load_trend(y1, p1)
    trend2 = _load_trend(y2, p2)

    # NOTE(review): for 'all' the label is str('all')[2:] == 'l', i.e.
    # 'Renzi-l'; kept unchanged because consumers may rely on the header.
    rows = [f'date\t{p1.capitalize()}-{str(y1)[2:]}\t{p2.capitalize()}-{str(y2)[2:]}\n']
    # NOTE(review): the window is always 365 days from 1 January of the start
    # year, so 29-Feb and (for a leap start year) 31-Dec are never emitted.
    n_days = 365
    for offset in range(n_days):
        day = start + timedelta(days=offset)
        k = to_strtime(day, False)
        if y1 == 'all':
            # 'all' rows are labelled with a dummy year '70'.
            jday = f'70-{calendar.month_name[day.month][:3]}-{day.day}'
        else:
            jday = k
        rows.append(f'{jday}\t{trend1.get(k, 0)}\t{trend2.get(k, 0)}\n')

    filename = f'trends/{y1}_{p1}_{y2}_{p2}_trends.tsv'
    with open(filename, 'w', encoding='utf-8') as f:
        f.writelines(rows)
    print('wrote:', n_days, 'trends')


def compute_possible_trends():
    """Run ``compute_trend`` for every pair of known series.

    The series are renzi 2018-2019 and salvini for every year in ``YEARS``,
    combined two at a time, plus the renzi/salvini 'all' comparison.

    On any error the traceback is printed, an IPython shell is opened when
    IPython is installed, and the process exits with status 1.
    """
    series = [('renzi', yr) for yr in (2018, 2019)]
    series += [('salvini', yr) for yr in YEARS]
    pairs = list(combinations(series, 2))
    pairs.append((('renzi', 'all'), ('salvini', 'all')))
    try:
        for (p1, y1), (p2, y2) in pairs:
            compute_trend(p1, y1, p2, y2)
    except Exception:
        traceback.print_exc()
        if fuck is not None:
            fuck()
        sys.exit(1)


def parse_politico(filename):
    """Load a JSON-lines post dump and group the posts by year.

    Each non-blank line of *filename* is one JSON object. Its 'time' value is
    parsed into a datetime and the keys in ``UNUSED_POST_KEYS`` are dropped
    (missing keys are ignored).

    Returns:
        A dict with
        - 'all': every post,
        - <year> (int): the posts of that year,
        - 'avg_<year>': the average length of 'post_text' for that year,
          truncated to an int.
    """
    from dateutil.parser import parse as dateparse

    posts = []
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue
            post = json.loads(line)
            post['time'] = dateparse(post['time'])
            for key in UNUSED_POST_KEYS:
                post.pop(key, None)
            posts.append(post)

    politico = {'all': posts}
    by_year = defaultdict(list)
    for post in posts:
        by_year[post['time'].year].append(post)
    for yr in sorted(by_year):
        yposts = by_year[yr]
        politico[yr] = yposts
        total_length = sum(len(p['post_text']) for p in yposts)
        politico["avg_" + str(yr)] = total_length // len(yposts)
    print('Parsed', filename)
    return politico


def calcolo_comuni(words):
    """Return the known places mentioned in *words*.

    Place names come from 'comuni.json' plus the cities in
    ``EUROPEAN_CAPITALS`` looked up in 'world_cities.json' (entries whose
    country code is CA or US are ignored). Names in ``EXCLUDED_COMUNI`` are
    never matched. Matching is case-insensitive and per word, so a place is
    listed once for every occurrence.

    Returns:
        {'locations': [place_record, ...]} where each record is the JSON
        object of the matched place.
    """
    with open('comuni.json', 'r', encoding='utf-8') as f:
        comuni_list = json.load(f)
    with open('world_cities.json', 'r', encoding='utf-8') as f:
        world_list = json.load(f)

    comuniCoord = {comune['name'].upper(): comune for comune in comuni_list}

    mondo = {}
    for city in world_list:
        if city['country_code'] in ('CA', 'US'):
            continue
        # Expose the coordinates under the same keys the comuni records use.
        city['lat'], city['lng'] = city['latitude'], city['longitude']
        mondo[city['name'].upper()] = city
    for italian_name, english_name in EUROPEAN_CAPITALS.items():
        comuniCoord[italian_name] = mondo[english_name]

    comuni = set(comuniCoord) - EXCLUDED_COMUNI
    return {'locations': [comuniCoord[w.upper()] for w in words
                          if w.upper() in comuni]}


def _write_json(path, obj):
    """Serialize *obj* as JSON into *path* and report it on stdout."""
    with open(path, 'w', encoding='utf-8') as f:
        f.write(json.dumps(obj))
    print('Wrote', path)


def write_words(politico, prefix):
    """Write the statistics files for one politician.

    *politico* is the mapping returned by ``parse_politico``; *prefix* is the
    file-name prefix. For every 'avg_<year>' key the average is written to
    'avg_<year>/<prefix>'. For 'all' and every year key the following files
    are written into the directory named after the key:
    '<prefix>_words', '<prefix>_counter', '<prefix>_emoji',
    '<prefix>_trend.json', '<prefix>_sleep' and '<prefix>_comuni.html'.

    Raises:
        ValueError: for a key that is neither 'all', a year in ``YEARS`` nor
            an 'avg_' key.
        TypeError: if ``calcolo_comuni`` does not return a dict.
    """
    import string
    import emoji
    from make_heatmap import Generator as HeatMapGen

    # One pass replaces every punctuation character with a space.
    punct_to_space = str.maketrans(
        {ch: ' ' for ch in string.punctuation + ''.join(["“", "’", "…"])})

    with open('italian_stopwords', 'r', encoding='utf-8') as f:
        stopwords = {line.strip() for line in f} | PERSONAL_STOPWORDS

    def filter_word(w):
        return bool(w) and not w.isdigit() and w not in stopwords

    # NOTE(review): UNICODE_EMOJI is the table used by the original code; its
    # shape differs between releases of the emoji package - confirm the
    # installed version still maps emoji characters directly.
    emoji_table = emoji.UNICODE_EMOJI
    valid_keys = set(YEARS) | {'all'}

    for k, v in politico.items():
        out_dir = str(k)

        # 'avg_YEAR' entries hold a single number: the average post length.
        if out_dir.startswith('avg'):
            filename_avg = out_dir + '/' + prefix
            with open(filename_avg, 'w', encoding='utf-8') as f:
                f.write(str(v))
            print('Wrote', filename_avg)
            continue

        if k not in valid_keys:
            raise ValueError(f'unexpected key in politico: {k!r}')

        base = out_dir + '/' + prefix
        filename_words = base + '_' + 'words'
        filename_counter = base + '_' + 'counter'
        filename_sleep = base + '_' + 'sleep'
        filename_emoji = base + '_' + 'emoji'
        filename_comuni = base + '_' + 'comuni.html'
        filename_trend = base + '_trend.json'

        # month -> hour -> number of posts. calendar.month_name[0] is '', so
        # an empty-named bucket is emitted too; kept so the file layout does
        # not change.
        sonno = {month[:3]: dict.fromkeys(range(24), 0)
                 for month in calendar.month_name}
        words = []
        emojis = []
        trends = []  # one day label per post
        for post in v:
            text = post['post_text'].replace('\n', ' ')
            tokens = text.translate(punct_to_space).lower().split(' ')
            words.extend(filter(filter_word, tokens))
            emojis.extend(t for t in tokens if t in emoji_table)
            time = post['time']
            sonno[calendar.month_name[time.month][:3]][time.hour] += 1
            # Only the 'all' series keeps the year in its day labels.
            trends.append(to_strtime(time, k == 'all'))
        print('Computed stemmed words, sleep and emoji counting')

        with open(filename_words, 'w', encoding='utf-8') as f:
            f.writelines(w + '\n' for w in words)
        print('Wrote', filename_words)

        _write_json(filename_counter, Counter(words))
        _write_json(filename_emoji, Counter(emojis))
        _write_json(filename_trend, Counter(trends))
        _write_json(filename_sleep, sonno)

        print('Calcolo comuni')
        comuni_visitati = calcolo_comuni(words)
        if not isinstance(comuni_visitati, dict):
            raise TypeError('calcolo_comuni must return a dict')
        HeatMapGen().run(comuni_visitati, filename_comuni)
        print()


if __name__ == '__main__':
    renzi = parse_politico('matteorenziufficiale.json')
    write_words(renzi, 'renzi')
    salvini = parse_politico('salviniofficial.json')
    write_words(salvini, 'salvini')
    compute_possible_trends()