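#!/usr/bin/env python3
"""Analyse the scraped posts of Matteo Renzi and Matteo Salvini
(matteorenziufficiale.json, salviniofficial.json; one JSON post per line).

For each year and for the whole timeline, emit word lists and counters,
emoji counters, posting-hour ("sleep") histograms, heatmaps of mentioned
places, average post lengths, and daily posting trends as TSV for plotting.
"""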
import json
import os
import calendar
from collections import Counter

# Interactive debugging hook (drops into an IPython shell when invoked).
from IPython import embed

# Create the output directories, ignoring those that already exist.
for yr in range(2013, 2020):
    os.makedirs('avg_' + str(yr), exist_ok=True)
    os.makedirs(str(yr), exist_ok=True)
os.makedirs('all', exist_ok=True)
os.makedirs('trends', exist_ok=True)


def to_strtime(time, includeYear):
    """Format a date/datetime as 'D-Mon' or, with includeYear, 'D-Mon-YYYY'."""
    if includeYear:
        return f'{time.day}-{calendar.month_name[time.month][:3]}-{time.year}'
    return f'{time.day}-{calendar.month_name[time.month][:3]}'
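# Illustrative (not executed): for a timestamp t on 3 May 2019,
#   to_strtime(t, True)  -> '3-May-2019'
#   to_strtime(t, False) -> '3-May'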


def compute_trend(p1, y1, p2, y2):
    """Write a TSV of daily post counts for two (politician, year) pairs.

    y1/y2 are either ints (compare two specific years) or both 'all'
    (compare the whole timelines on a dummy-year axis; see jday below).
    """
    from datetime import date, timedelta

    max_date = date(2019, 11, 24)  # last day covered by the scraped data
    if y1 == 'all':
        assert y2 == 'all'
        # Renzi's page only has data from 2018 onwards.
        start = date(2018, 1, 1) if 'renzi' in (p1, p2) else date(2013, 1, 1)
        end = max_date
    else:
        mmin = min((y1, y2))
        mmax = max((y1, y2))
        start = date(mmin, 1, 1)
        end = date(mmax, 12, 31)
        if end > max_date:
            end = max_date
    delta = end - start
    assert delta.days > 0, embed()  # drop into IPython on an empty range

    filename = f'trends/{y1}_{p1}_{y2}_{p2}_trends.tsv'
    with open(str(y1)+'/'+p1+'_trend.json', 'r') as f:
        trend1 = json.loads(f.read())
    with open(str(y2)+'/'+p2+'_trend.json', 'r') as f:
        trend2 = json.loads(f.read())

    with open(filename, 'w') as f:
        f.write(f'date\t{p1.capitalize()}-{str(y1)[2:]}\t{p2.capitalize()}-{str(y2)[2:]}\n')
        cnt = 0
        for i in range(365):  # exactly 365 daily rows starting from `start`
            cnt += 1
            day = start + timedelta(days=i)
            k = to_strtime(day, False)
            v1 = trend1.get(k, 0)
            v2 = trend2.get(k, 0)
            mth = calendar.month_name[day.month][:3]
            # Dummy year ('70) so yearless dates still parse when plotted.
            jday = f'70-{mth}-{day.day}' if y1 == 'all' else k  # with year: to_strtime(day, y1 != 'all')
            f.write(f'{jday}\t{v1}\t{v2}\n')
    print('wrote:', cnt, 'trends')
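# Illustrative usage (assumes the *_trend.json files were already written
# by write_words below):
#   compute_trend('renzi', 2018, 'salvini', 2019)
# reads 2018/renzi_trend.json and 2019/salvini_trend.json and produces
# trends/2018_renzi_2019_salvini_trends.tsv, one row per day.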


def compute_possible_trends():
    from itertools import repeat, combinations
    # All (politician, year) pairs with data: Renzi 2018-2019, Salvini 2013-2019.
    r = zip(repeat('renzi'), (2018, 2019))
    s = zip(repeat('salvini'), range(2013, 2020))
    t = tuple(r) + tuple(s)
    poss = list(combinations(t, 2))
    poss.append((('renzi', 'all'), ('salvini', 'all')))
    try:
        for p1, p2 in poss:
            compute_trend(*p1, *p2)
    except Exception:
        embed()  # inspect the failure interactively, then abort
        import sys
        sys.exit()
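# combinations(t, 2) over the 9 (politician, year) pairs gives C(9, 2) = 36
# comparisons; the final ('all', 'all') pair brings the total to 37 TSV files.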


def parse_politico(filename):
    """Load a line-delimited JSON dump of posts and bucket them by year."""
    from dateutil.parser import parse as dateparse

    politico = dict()
    with open(filename, 'r') as f:
        content = [json.loads(l) for l in f.readlines()]

    posts = []  # renamed from `all`, which shadowed the builtin
    for j in content:  # parse dates and drop unused keys
        j['time'] = dateparse(j['time'])
        for k in ['text', 'shared_text', 'link', 'post_url', 'shares', 'comments', 'likes', 'image', 'post_id']:
            j.pop(k)
        posts.append(j)

    politico['all'] = posts

    # Per-year posts and average post length per year.
    years = {p['time'].year for p in posts}
    for yr in years:
        yposts = [j for j in posts if j['time'].year == yr]
        politico[yr] = yposts
        total = sum(len(p['post_text']) for p in yposts)
        politico['avg_' + str(yr)] = int(total / len(yposts))

    print('Parsed', filename)
    return politico
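# The returned dict maps 'all' and each year to post lists, plus 'avg_<year>'
# to an int, e.g. {'all': [...], 2018: [...], 'avg_2018': 142} (values illustrative).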


def calcolo_comuni(words):
    """Map words naming an Italian comune (or a few European capitals of
    interest) to their coordinate records, for the heatmap."""
    with open('comuni.json', 'r') as f:
        comuni_content = f.read()

    with open('world_cities.json', 'r') as f:
        world_content = f.read()

    # Load the Italian comuni, keyed by uppercase name.
    comuniCoord = dict()
    for j in json.loads(comuni_content):
        comuniCoord[j['name'].upper()] = j

    mondo = dict()
    for j in json.loads(world_content):
        name = j['name'].upper()
        cc = j['country_code']
        if cc not in ['CA', 'US']:  # skip Canada/US entries
            j['lat'], j['lng'] = j['latitude'], j['longitude']
            mondo[name] = j

    # Italian names of a few European capitals of interest.
    interesse = {'PARIGI': 'PARIS', 'MOSCA': 'MOSCOW', 'BERLINO': 'BERLIN', 'LONDRA': 'LONDON', 'BRUXELLES': 'BRUSSELS'}
    europa = {key: mondo[value] for key, value in interesse.items()}
    comuniCoord.update(europa)

    comuni = set(comuniCoord.keys())
    # Drop comune names that collide with common Italian words or surnames
    # ("AMATO", "MONTI", "ORA", ...) to avoid false positives.
    comuni -= {
        "ALFANO", "PAESE", "FONDO", "VITA", "CARDINALE", "PARENTI",
        "AMATO", "BELLA", "LIBERI", "BOMBA", "POPOLI", "MENTANA",
        "MONTI", "CALCI", "ORA", "DON", "PREZZO", "CALCIO",
        "MACELLO", "RUSSI", "PORTE",
    }

    visitati = filter(lambda w: w.upper() in comuni, words)
    return {'locations': [comuniCoord[v.upper()] for v in visitati]}
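# Illustrative (assuming 'MILANO' appears in comuni.json):
#   calcolo_comuni(['oggi', 'milano', 'parigi'])
#   -> {'locations': [<MILANO record>, <PARIS record>]}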


def write_words(politico, prefix):
    """For each key of `politico`, write the per-year (or 'all') outputs:
    word list, word counter, emoji counter, daily trend, posting-hour
    ("sleep") histogram, places heatmap HTML, and average post length.
    `prefix` is the filename prefix ('renzi' or 'salvini')."""
    import string
    import emoji

    punctuations = string.punctuation + '“’…'
    with open('italian_stopwords', 'r') as f:
        # Hand-picked extra stopwords: small numbers, truncated words, etc.
        personal = [str(i) for i in range(50)] + ['tre', 'ps', 'fra', 'va', 'ce', 'due', 'co', 'qui', 'di', 'far', 'sa', 'c’è', 'quattro', 'cinque', 'sei', 'sette', 'otto', 'nove', 'dieci', 'post', 'd', 'p', 's', 'de', 'ly']
        personal.remove('18')  # deliberately keep '18' as a token
        stopwords = set([w.strip() for w in f.readlines()] + personal)

    for k, v in politico.items():
        # politico keys:
        #   'all'        -> every post
        #   <year>       -> posts for that year
        #   'avg_<year>' -> average post length for that year
        words = []
        emojis = []
        sonno = dict()
        # Pre-build the month -> hour-of-day layout, all counts at zero.
        # (month_name[0] is the empty string, hence the [1:].)
        for month in calendar.month_name[1:]:
            sonno[month[:3]] = dict()
            for i in range(24):
                sonno[month[:3]][i] = 0

        filename_words = f'{k}/{prefix}_words'
        filename_counter = f'{k}/{prefix}_counter'
        filename_sleep = f'{k}/{prefix}_sleep'
        filename_emoji = f'{k}/{prefix}_emoji'
        filename_comuni = f'{k}/{prefix}_comuni.html'
        filename_trend = f'{k}/{prefix}_trend.json'
        filename_avg = f'{k}/{prefix}'

        # Write the average post length and move on.
        if str(k).startswith('avg'):
            with open(filename_avg, 'w') as f:
                f.write(str(v))
            print('Wrote', filename_avg)
            continue

        def filter_word(w):
            return w not in stopwords and not w.isdigit() and w != ''

        assert k in list(range(2013, 2020)) + ['all'], k

        trends = list()  # one entry per post: the day it was published
        for j in v:
            text = j['post_text'].replace('\n', ' ').lower()
            for punct in punctuations:
                text = text.replace(punct, ' ')
            words.extend(filter(filter_word, text.split(' ')))
            # emoji.UNICODE_EMOJI is the flat dict of old emoji releases
            # (restructured in emoji 1.0, removed in 2.0).
            emojis.extend(filter(lambda w: w in emoji.UNICODE_EMOJI, text.split(' ')))

            time = j['time']
            sonno[calendar.month_name[time.month][:3]][time.hour] += 1
            trends.append(to_strtime(time, k == 'all'))
        print('Computed word, sleep and emoji counts')

        with open(filename_words, 'w') as f:
            f.writelines([w + '\n' for w in words])
        print('Wrote', filename_words)

        wcounter = Counter(words)
        with open(filename_counter, 'w') as f:
            f.write(json.dumps(wcounter))
        print('Wrote', filename_counter)

        ecounter = Counter(emojis)
        with open(filename_emoji, 'w') as f:
            f.write(json.dumps(ecounter))
        print('Wrote', filename_emoji)

        with open(filename_trend, 'w') as f:
            f.write(json.dumps(Counter(trends)))
        print('Wrote', filename_trend)

        with open(filename_sleep, 'w') as f:
            f.write(json.dumps(sonno))
        print('Wrote', filename_sleep)

        print('Computing comuni (visited places)')
        comuni_visitati = calcolo_comuni(words)
        from make_heatmap import Generator as HeatMapGen
        generator = HeatMapGen()
        assert isinstance(comuni_visitati, dict)
        generator.run(comuni_visitati, filename_comuni)
        print()
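# After a full run the output tree looks like (illustrative):
#   2018/renzi_words, 2018/renzi_counter, 2018/renzi_sleep, 2018/renzi_emoji,
#   2018/renzi_trend.json, 2018/renzi_comuni.html, avg_2018/renzi,
#   all/salvini_words, ..., trends/2018_renzi_2019_salvini_trends.tsv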


if __name__ == '__main__':
    renzi = parse_politico('matteorenziufficiale.json')
    write_words(renzi, 'renzi')
    salvini = parse_politico('salviniofficial.json')
    write_words(salvini, 'salvini')

    compute_possible_trends()