Melevisione/gather.py
bparodi@lezzo.org 122598405d more
2024-11-05 16:45:42 +01:00

197 lines
6.1 KiB
Python

import json, sys, os, glob
from rapidfuzz import process, fuzz
from collections import namedtuple
from dateutil import parser
# Raw row as loaded from a season file: number, date and title, in that order.
Episode = namedtuple('Episode', 'n date title')
# Normalised row: the same three fields plus the season it belongs to.
WellFormed = namedtuple('WellFormed', 'n date title season')
# Pair of running episode numbers; `abs` is carried across seasons while
# `rel` is re-seeded by reset().
EpCounter = namedtuple('EpCounter', 'abs rel')
def reset(counter, season):
    """Return the counter to use for the season that follows ``season``.

    ``season`` is the season that has just been processed, so the lookup is
    done on ``season + 1``.  The absolute count is carried over unchanged;
    the relative count restarts at 1, except that season 11 starts at 136
    and season 12 starts at 236.
    """
    first_rel_by_season = {11: 136, 12: 236}
    upcoming = season + 1
    return EpCounter(counter.abs, first_rel_by_season.get(upcoming, 1))
def inc(counter):
    """Return a new counter with both the absolute and relative counts advanced by one."""
    next_abs = counter.abs + 1
    next_rel = counter.rel + 1
    return EpCounter(abs=next_abs, rel=next_rel)
# Italian month names, January to December.
mesi = [
    "gennaio", "febbraio", "marzo", "aprile",
    "maggio", "giugno", "luglio", "agosto",
    "settembre", "ottobre", "novembre", "dicembre",
]
# English month names, index-aligned with `mesi`.
months = [
    "january", "february", "march", "april",
    "may", "june", "july", "august",
    "september", "october", "november", "december",
]
# Italian -> English month name lookup.
month_table = {italian: english for italian, english in zip(mesi, months)}
def parse_date(dstr):
    """Parse a date string whose month may be spelled in Italian.

    The input is lower-cased, every Italian month name listed in
    ``month_table`` is replaced by its English counterpart, and the result
    is handed to ``parser.parse``; whatever that call returns is returned
    unchanged.
    """
    text = dstr.lower()
    for italian, english in month_table.items():
        # str.replace leaves the text untouched when the name is absent.
        text = text.replace(italian, english)
    return parser.parse(text)
# Running episode numbering; both counts start at 1.
counter = EpCounter(1, 1)
# Season number -> list of Episode rows loaded from "<season>.txt".
melevisione = dict()
# Titles dropped from every season before any further processing.
scarti = {'Il Genio della pietra'}
for season in range(1, 18):
    # Name the encoding explicitly so accented titles decode the same way on
    # every platform instead of depending on the locale default.
    # Assumes the season files are UTF-8 (the JSON default) - confirm if the
    # files were produced with another encoding.
    with open(f'{season}.txt', 'r', encoding='utf-8') as fp:
        rows = json.load(fp)
    # Only the first three items of each row are used: number, date, title.
    melevisione[season] = [
        Episode(row[0], row[1], row[2])
        for row in rows
        if row[2] not in scarti
    ]
# Expected number of episodes per season, keyed by season number (1-17).
# The counts are listed in season order and numbered starting from 1.
expected = dict(enumerate(
    (
        103, 162, 151, 127, 144, 165,  # seasons 1-6
        144, 156, 130, 135, 100, 128,  # seasons 7-12
        45, 100, 100, 75, 70,          # seasons 13-17
    ),
    start=1,
))
def make(n, maybedate, maybetitle, stagione):
    """Build a ``WellFormed`` row, working out which text field is the date.

    Both text fields are cleaned first.  The first field is accepted as the
    date when its first four characters are digits, or when its last four
    characters are digits and the second field's are not.  In the mirrored
    case (only the second field ends in digits) the two fields are swapped.

    Args:
        n: Episode number, stored unchanged.
        maybedate: Text expected to hold the date (may actually be the title).
        maybetitle: Text expected to hold the title (may actually be the date).
        stagione: Season number, stored unchanged.

    Returns:
        ``WellFormed(n, date_text, title_text, stagione)``; the date is still
        a string at this point.

    Raises:
        ValueError: If neither ordering of the two fields can be recognised.
    """
    # When a cleaned field begins with one of these dates, it is cut down to
    # exactly the date and anything following it is discarded.
    known_dates = (
        '25 dicembre 2002',
        '8 dicembre 2003',
        '21 ottobre 2003',
        '19 novembre 2003',
        '13 gennaio 2004',
        '14 gennaio 2004',
        '12 gennaio 2004',
    )

    def clean(text):
        """Strip ordinal marks, newlines and any trailing ``<ref`` markup."""
        text = text.replace('º', '').replace('\n', ' ')
        # Keep only what precedes the first '<ref' (everything if absent).
        text = text.partition('<ref')[0]
        for known in known_dates:
            if text.startswith(known):
                return known
        # The original also ran replace('º', '1') here; every 'º' has already
        # been removed above, so that call could never change anything.
        return text.strip()

    date_text = clean(maybedate.strip())
    title_text = clean(maybetitle.strip())

    # Hand-written override: a row whose date field starts with
    # '24 aprile 2001' gets a fixed number, date and title.
    # NOTE(review): the original comment here only said "11 settembre"
    # (11 September); the reason for the override is not visible in this file.
    if date_text.startswith('24 aprile 2001'):
        return WellFormed(134, "11 April 2001", 'Tanti auguri a te', stagione)

    if date_text[:4].isdigit():
        return WellFormed(n, date_text, title_text, stagione)

    date_ends_in_digits = date_text[-4:].isdigit()
    title_ends_in_digits = title_text[-4:].isdigit()
    if date_ends_in_digits and not title_ends_in_digits:
        return WellFormed(n, date_text, title_text, stagione)
    if title_ends_in_digits and not date_ends_in_digits:
        # The two columns arrived swapped.
        return WellFormed(n, title_text, date_text, stagione)

    raise ValueError(f'Dunno: {n}|{date_text}|{title_text}|{stagione}')
# Validate every season against `expected`, then renumber all episodes with
# the absolute counter and parse their dates into `acc`.
acc = []
for season, episodes in melevisione.items():
    if expected[season] != len(episodes):
        # Print the episode numbers that appear to be missing, then abort.
        # The upper bound is inclusive so the last expected number is
        # reported as well.
        missing = f'{season}, {expected[season]}, {len(episodes)}'
        segnate = {int(ep.n) for ep in episodes}
        wanted = set(range(1, expected[season] + 1))
        print(wanted - segnate)
        # Raised explicitly rather than via `assert False` so the check also
        # runs under `python -O`, and so the failure carries the
        # "season, expected, found" summary.
        raise AssertionError(missing)
    for ep in episodes:
        # The number found in the source must match either the absolute or
        # the relative running count.
        n = int(ep.n)
        if counter.abs != n and counter.rel != n:
            raise AssertionError(f'{ep}|{counter}|stagione={season}')
        ep_ = make(ep.n, ep.date, ep.title, season)
        date = parse_date(ep_.date)
        # Episodes are always stored under their absolute number.
        acc.append(WellFormed(counter.abs, date.date(), ep_.title, ep_.season))
        counter = inc(counter)
    # Seed the relative count for the season that follows.
    counter = reset(counter, season)

# Sanity check on the absolute number given to the last episode.
# NOTE(review): the `len(scarti) ... - 1` terms cancel only while `scarti`
# holds exactly one title, making this equal to sum(expected.values()) -
# confirm the intended formula before adding more discarded titles.
EPISODI_TOTALI = len(scarti) + sum(expected.values()) - 1
if acc[-1].n != EPISODI_TOTALI:
    raise AssertionError(f'{acc[-1].n} != {EPISODI_TOTALI}')
from copy import copy
# Titles in episode order (the corpus searched by the fuzzy matcher) and a
# title -> episode lookup.  When two episodes share a title, the one that
# comes later in `acc` is the one kept in the dict.
all_titles = [episode.title for episode in acc]
all_episodes = {episode.title: episode for episode in acc}
# from IPython import embed as fuck; fuck()
# Free-text titles and downloaded file names to be matched against the
# episode titles.
targets = [
    "la missione di fata",
    "La Melevisione 1999 - Con la carta si può - E1 [3183cb06-5276-4093-bf96-16f7455cb4ff].mp4",
    "melevisone 2010la genietta del cuore",
    "melevisone 2010la genietta del cuore [pTy6WlKEUIM].mp4",
    "melevisione i classici i tre desideri [0luzDKwhu7Y].mp4",
    "melevisione 2015 la pace di miraba [oyXErd8BNCQ].mp4",
    "melevisione 2015 una balia per il lupo [PiAX2fnm6ps].webm",
    "melevisione i claasici i dolori di nina [kaAaYYSbln8].mp4",
    "melevisione 2010 la perla dei sette mari [gm_nIC-zfOg].mp4",
    "melevisione 2010 l orcoccodrillo [M9TVqW1adS4].mp4",
    "2010 l orcoccodrillo [M9TVqW1adS4].mp4",
    "Melevisione 2015 il Natale dei bambini cattivi [BdDUTs1-nHY].webm",
    "Melevisione 2013 Un giorno senza magia [CqFRsLWe1PA].webm",
    "Melevisione 2013 il drago verde [kz96ZJ6miFY].webm",
    "Melevisione 2013 il faraone di vermiosis [xBJ4m5ACET8].webm",
    "Melevisione 2013 una pianta coi denti [5JeycqtKm2E].mp4",
    "Melevisione 2014 il dono di fata erbinia [mR0aGpsAeA8].mp4",
    "Melevisione 2014 il drago ha sette teste [UKhRpZjfapA].webm",
    "Melevisione 2014 pignabolli che passione [q51VlqWt_pM].webm",
    "Melevisione 2014 una dolce Pianta [eUIuoOwZyUs].webm",
    "Melevisione 2014 una gatta con le piume [rbscLLyJlPw].mp4",
    "Melevisione 2014, tutti tranne barba blu [wmq5Q6ip5UQ].webm",
]
def preprocess(t):
    """Strip known leading show-name fragments from a target string.

    The prefixes are tried once each, in the order listed, and matching is
    case-sensitive.  Because the string is shortened as it goes, a later
    prefix can match what an earlier removal exposed.  Note that
    ``"Melevisione 20"`` removes only the first two digits of a year, so
    ``"Melevisione 2013 x"`` becomes ``"13 x"``.
    """
    for prefix in ("melevisione ", "La Melevisione ", "i claasici", "Melevisione 20"):
        width = len(prefix)
        if t[:width] == prefix:
            t = t[width:]
    return t
# For every target: shortlist five titles by partial ratio, re-rank the
# shortlist by case-insensitive token ratio, and print the winning episode.
for target in targets:
    query = preprocess(target)
    needle = query.lower()
    shortlist = process.extract(query, all_titles, scorer=fuzz.partial_ratio, limit=5)

    winner, top_score = None, 0
    for candidate in shortlist:
        title = candidate[0]
        rating = fuzz.token_ratio(title.lower(), needle)
        if rating < top_score:
            continue
        # Ties go to the candidate that appears later in the shortlist.
        winner, top_score = title, rating

    match = all_episodes[winner]
    separator = '---------------------'
    print(separator)
    print(f'\t{query} -> {match}')
    print(separator)