Fuzzy-match downloaded file names to episode titles.

17.txt: store the title of episode 34 with a literal "à" instead of the
"\u00e0" escape.
gather.py: import rapidfuzz's `process`, index the episodes by title, and
replace the single hard-coded `max(...)` lookup with a loop that strips known
prefixes from each sample file name and picks the best-scoring title.

NOTE(review): this patch was recovered from a copy whose line breaks had been
collapsed. Leading whitespace and the position of blank context lines are
reconstructed; confirm them against the target files before applying.
NOTE(review): the added gather.py lines were revised during review
(case-insensitive prefix stripping, a guard for an empty candidate list,
comments), so the recorded post-image blob id f525ad0 no longer describes them.

diff --git a/17.txt b/17.txt
index c411ce2..16423fa 100644
--- a/17.txt
+++ b/17.txt
@@ -167,7 +167,7 @@
     [
         "34",
         "15 febbraio 2015",
-        "La pace di Mirab\u00e0"
+        "La pace di Mirabà"
     ],
     [
         "35",
diff --git a/gather.py b/gather.py
index 2f841cb..f525ad0 100644
--- a/gather.py
+++ b/gather.py
@@ -1,5 +1,5 @@
 import json, sys, os, glob
-from rapidfuzz import fuzz
+from rapidfuzz import process, fuzz
 from collections import namedtuple
 from dateutil import parser
 
@@ -141,6 +141,59 @@ from copy import copy
 
 all_episodes = copy(acc)
 all_titles = [a.title for a in acc]
+all_episodes = {a.title: a for a in all_episodes}
 
 # from IPython import embed as fuck; fuck()
-max(all_episodes, key=lambda x: fuzz.ratio(x.title, "la missione di fata"))
+
+targets = [
+    "la missione di fata",
+    "La Melevisione 1999 - Con la carta si può - E1 [3183cb06-5276-4093-bf96-16f7455cb4ff].mp4",
+    "melevisone 2010la genietta del cuore",
+    "melevisone 2010la genietta del cuore [pTy6WlKEUIM].mp4",
+    "melevisione i classici i tre desideri [0luzDKwhu7Y].mp4",
+    "melevisione 2015 la pace di miraba [oyXErd8BNCQ].mp4",
+    "melevisione 2015 una balia per il lupo [PiAX2fnm6ps].webm",
+    "melevisione i claasici i dolori di nina [kaAaYYSbln8].mp4",
+    "melevisione 2010 la perla dei sette mari [gm_nIC-zfOg].mp4",
+    "melevisione 2010 l orcoccodrillo [M9TVqW1adS4].mp4",
+    "2010 l orcoccodrillo [M9TVqW1adS4].mp4",
+    "Melevisione 2015 il Natale dei bambini cattivi [BdDUTs1-nHY].webm",
+]
+
+def preprocess(t):
+    """Strip known series prefixes from a file name before fuzzy matching.
+
+    Each prefix in ``starters`` is tried once, in list order, and removed when
+    ``t`` begins with it (ignoring case); the remainder is returned.
+    """
+    starters = ["melevisione ", "La Melevisione ", "i claasici"]
+    for s in starters:
+        # Compare case-insensitively: the samples spell the prefix both as
+        # "melevisione " and as "Melevisione ".
+        if t[:len(s)].lower() == s.lower():
+            t = t[len(s):]
+    return t
+
+
+for target in targets:
+    t = preprocess(target)
+    t_lower = t.lower()
+    choices = all_titles
+    # Shortlist five candidates by partial match, then re-rank them below.
+    res = process.extract(t, choices, scorer=fuzz.partial_ratio, limit=5)
+
+    best_title, best_score = None, 0
+    for r in res:
+        score = fuzz.token_ratio(r[0].lower(), t_lower)
+        # print(score, r)
+        # ">=" lets a later candidate win a tie.
+        if score >= best_score:
+            best_title, best_score = r[0], score
+    if best_title is None:
+        # No candidates (e.g. no titles were loaded): nothing to look up.
+        print(f'\t{t} -> no match')
+        continue
+    found = all_episodes[best_title]
+    print('---------------------')
+    print(f'\t{t} -> {found}')
+    print('---------------------')