Melevisione/gather.py
bparodi@lezzo.org 7708dafafe fuzz
2024-11-05 15:01:56 +01:00

146 lines
4.1 KiB
Python

import json, sys, os, glob
from rapidfuzz import fuzz
from collections import namedtuple
from dateutil import parser
# One raw row: episode number, date cell and title cell, in that order.
Episode = namedtuple('Episode', 'n date title')
# A normalised row: same three fields plus the season it belongs to.
WellFormed = namedtuple('WellFormed', 'n date title season')
# A pair of running episode numbers, `abs` and `rel`.
EpCounter = namedtuple('EpCounter', 'abs rel')
def reset(counter, season):
    """Return the counter to use when the season after `season` begins.

    `counter.abs` is carried over unchanged. `rel` restarts at 1, except
    when the following season is 11 or 12, where it starts at 136 and 236
    respectively.
    """
    # The caller invokes this at the end of `season`, so the starting value
    # is looked up for the next one.
    following = season + 1
    rel_start = {11: 136, 12: 236}.get(following, 1)
    return EpCounter(counter.abs, rel_start)
def inc(counter):
    """Return a new counter with both `abs` and `rel` advanced by one."""
    next_abs = counter.abs + 1
    next_rel = counter.rel + 1
    return EpCounter(next_abs, next_rel)
# Italian month names in calendar order.
mesi = [
    "gennaio", "febbraio", "marzo", "aprile",
    "maggio", "giugno", "luglio", "agosto",
    "settembre", "ottobre", "novembre", "dicembre",
]
# English month names, aligned index-by-index with `mesi`.
months = [
    "january", "february", "march", "april",
    "may", "june", "july", "august",
    "september", "october", "november", "december",
]
# Italian -> English month-name lookup.
month_table = {mese: month for mese, month in zip(mesi, months)}
def parse_date(dstr):
    """Parse a date string whose month may be spelled in Italian.

    The input is lower-cased, every Italian month name listed in
    `month_table` is swapped for its English counterpart, and the result is
    handed to `dateutil`'s `parser.parse`, whose return value is passed
    through unchanged.
    """
    lowered = dstr.lower()
    for italian, english in month_table.items():
        # `str.replace` returns the text unchanged when the name is absent,
        # so no membership test is needed first.
        lowered = lowered.replace(italian, english)
    return parser.parse(lowered)
# Running (abs, rel) episode counter; both numberings start at 1.
counter = EpCounter(1, 1)
# Season number -> list of Episode rows loaded from "<season>.txt".
melevisione = {}
# Titles that are dropped from every season while loading.
scarti = {'Il Genio della pietra'}
for i in range(1, 18):
    # Each season is stored as JSON in a file named after its number.
    # Decode explicitly as UTF-8 (the JSON interchange encoding) instead of
    # relying on the platform's locale default, which can mangle or reject
    # accented titles.
    with open(f'{i}.txt', 'r', encoding='utf-8') as fp:
        c = json.load(fp)
    # Only the first three columns of each row are used: number, date, title.
    eps = [Episode(row[0], row[1], row[2]) for row in c]
    eps = [e for e in eps if e.title not in scarti]
    melevisione[i] = eps
# Number of episodes each season must contain, keyed by season number.
# The counts are listed in season order, starting from season 1.
expected = dict(enumerate(
    (
        103, 162, 151, 127, 144, 165,
        144, 156, 130, 135, 100, 128,
        45, 100, 100, 75, 70,
    ),
    start=1,
))
def make(n, maybedate, maybetitle, stagione):
    """Build a `WellFormed` record from a row whose two text cells may be swapped.

    Both cells are stripped and cleaned, then classified as follows:

    * a first cell starting with '24 aprile 2001' yields a fixed,
      hard-coded record;
    * a first cell beginning with four digits is kept as the date;
    * otherwise, if exactly one cell ends in four digits, that cell is the
      date and the other is the title.

    Args:
        n: episode number, stored as given.
        maybedate: text expected to be the date (may actually be the title).
        maybetitle: text expected to be the title (may actually be the date).
        stagione: season, stored as given.

    Raises:
        Exception: when none of the rules above applies.
    """

    def clean(text):
        """Drop ordinal marks, newlines and any trailing `<ref...` markup."""
        text = text.replace('º', '').replace('\n', ' ')
        # Keep only what precedes the first '<ref'; a no-op when absent.
        text = text.partition('<ref')[0]
        # Cells that begin with one of these dates are reduced to the bare
        # date, discarding whatever follows it.
        known_dates = (
            '25 dicembre 2002',
            '8 dicembre 2003',
            '21 ottobre 2003',
            '19 novembre 2003',
            '13 gennaio 2004',
            '14 gennaio 2004',
            '12 gennaio 2004',
        )
        for prefix in known_dates:
            if text.startswith(prefix):
                return prefix
        # NOTE(review): the same character was already removed above, so this
        # replacement looks like a no-op. Confirm the intended character.
        text = text.replace('º', '1')
        return text.strip()

    date_cell = clean(maybedate.strip())
    title_cell = clean(maybetitle.strip())

    # Hard-coded special case; the original note here read "11 settembre".
    if date_cell.startswith('24 aprile 2001'):
        return WellFormed(134, "11 April 2001", 'Tanti auguri a te', stagione)

    if date_cell[:4].isdigit():
        return WellFormed(n, date_cell, title_cell, stagione)

    date_ends_in_year = date_cell[-4:].isdigit()
    title_ends_in_year = title_cell[-4:].isdigit()
    if date_ends_in_year and not title_ends_in_year:
        return WellFormed(n, date_cell, title_cell, stagione)
    if title_ends_in_year and not date_ends_in_year:
        # The cells arrived swapped: put them back in date/title order.
        return WellFormed(n, title_cell, date_cell, stagione)

    raise Exception(f'Dunno: {n}|{date_cell}|{title_cell}|{stagione}')
# assertions
# Walk every season in order, validate its size and episode numbering against
# the running counter, and collect one normalised WellFormed record per
# episode. These checks use `assert`, so they are skipped under `python -O`.
acc = []
for k, v in melevisione.items():
    if expected[k] != len(v):
        # Season number, expected size and actual size; used as the failure
        # message below.
        missing = f'{k}, {expected[k]}, {len(v)}'
        segnate = {int(x[0]) for x in v}
        # Inclusive upper bound, so the season's last number is checked too.
        # NOTE(review): assumes the season's rows are numbered
        # 1..expected[k]; for rows numbered differently the printed set is
        # only a rough hint.
        wanted = set(range(1, expected[k] + 1))
        print(wanted - segnate)
        assert False, missing
    for ep in v:
        n = int(ep.n)
        # The listed number must match either of the two running counts.
        assert counter.abs == n or counter.rel == n, f'{ep}|{counter}|stagione={k}'
        ep_ = make(ep.n, ep.date, ep.title, k)
        date = parse_date(ep_.date)
        # Renumber with the absolute count and keep only the calendar date.
        ep_ = WellFormed(counter.abs, date.date(), ep_.title, ep_.season)
        acc.append(ep_)
        counter = inc(counter)
    # Season finished: set up the relative count for the next one.
    counter = reset(counter, k)
# The last collected record must carry this absolute number.
EPISODI_TOTALI = len(scarti) + sum(expected.values()) - 1
assert acc[-1].n == EPISODI_TOTALI, f'{acc[-1].n} != {EPISODI_TOTALI}'
from copy import copy
# Shallow copy of the collected records, plus the bare list of titles.
all_episodes = copy(acc)
all_titles = [a.title for a in acc]
# Fuzzy lookup of the record whose title best matches the query.
# NOTE(review): the result is not stored or printed. This looks like a
# leftover from an interactive session; confirm whether it is still needed.
max(all_episodes, key=lambda x: fuzz.ratio(x.title, "la missione di fata"))