120 lines
4.2 KiB
Python
120 lines
4.2 KiB
Python
import re
|
|
import json
|
|
import sys
|
|
import pprint as pp
|
|
from collections import namedtuple
|
|
|
|
# Output record shared by the parsers in this file: a three-field tuple
# holding an episode's number, its date and its title, in that order.
Finito = namedtuple(
    'Finito',
    ('Number', 'Date', 'Title'),
)
|
|
|
|
|
|
def parse_wiki_source(wiki_source, stagione):
    """Yield a ``Finito`` record for every four-cell table row in *wiki_source*.

    A row is any span of text matching ``| <digits> | cell | cell | cell |``.
    How the three cells after the leading number are interpreted depends on
    *stagione*:

    * ``8``: date, title, per-season number;
    * ``13`` to ``17``: title, date, per-season number, followed by clean-up
      of misplaced dates, ``rowspan`` attributes and wiki markup;
    * any other value: per-season number, date, title.

    Every matched row is also echoed to stdout (debug output).

    Args:
        wiki_source: Raw wiki markup to scan.
        stagione: Season number selecting the column layout.

    Yields:
        ``Finito(Number, Date, Title)`` where ``Number`` is the matched digit
        string and the date and title are stripped of surrounding whitespace.
    """
    # NOTE(review): the nesting below was reconstructed from a listing whose
    # indentation had been lost - confirm that the clean-up steps are meant
    # to apply to seasons 13-17 only.
    row_re = re.compile(
        r'\|\s*(\d+)\s*\|\s*([^\|]*)\s*\|\s*([^\|]+)\s*\|\s*([^\|]+)\s*\|'
    )

    # Rows are collected first and yielded afterwards so that all debug
    # output is printed before the first record is handed to the caller.
    records = []
    for match in row_re.findall(wiki_source):
        print(f'----------------------- {list(match)}')
        if stagione == 8:
            n_tot, prima_tv_ita, nome_della_puntata, _ = match
        elif stagione not in {13, 14, 15, 16, 17}:
            n_tot, _, prima_tv_ita, nome_della_puntata = match
        else:
            n_tot, nome_della_puntata, prima_tv_ita, n = [m.strip() for m in match]
            if n.endswith(('2011', '2012')):
                # The last cell ends in a year: treat it as the date and the
                # cell before it as the title.
                nome_della_puntata = prima_tv_ita
                prima_tv_ita = n
            if prima_tv_ita.startswith('rowspan'):
                # The date cell captured a ``rowspan`` attribute; use the
                # last cell instead.
                prima_tv_ita = n
            # Drop trailing link brackets and italic quote markup.
            nome_della_puntata = nome_della_puntata.strip(']]').strip("'").strip(']').replace("''", '')
            print('FFFFFFF', list(match))

        records.append(
            Finito(n_tot, prima_tv_ita.strip(), nome_della_puntata.strip())
        )

    yield from records
|
|
|
|
|
|
def parse_melevisione_data(text, ep_number):
    """Collect ``Finito`` records from wiki table text laid out line by line.

    An entry is recognised as four consecutive lines that all start with
    ``|``: a header line containing ``||`` whose first cell is the episode
    number, two value lines, and one further line that must be present but
    is not read (presumably the character list - confirm against the data).

    Args:
        text: Raw wiki markup.
        ep_number: Selects the field order. For ``2`` or ``3`` the first
            value line is stored as ``Title`` and the second as ``Date``;
            for any other value the first is stored as ``Date`` and the
            second as ``Title``.

    Returns:
        A list of ``Finito`` records in document order.
    """
    # NOTE(review): nesting reconstructed from a listing whose indentation
    # had been lost - confirm against the original file.
    rows = text.split('\n')
    row_count = len(rows)
    found = []

    def is_cell(index):
        # True when ``index`` is in range and that line opens a table cell.
        return index < row_count and rows[index].startswith('|')

    cursor = 0
    while cursor < row_count:
        head = rows[cursor]
        if head.startswith('|') and '||' in head:
            # First cell of the header line is the episode number.
            number = head.strip('|').split('||')[0].strip()

            cursor += 1
            if is_cell(cursor):
                first = rows[cursor].strip("|'").strip('|')

                cursor += 1
                if is_cell(cursor):
                    second = rows[cursor].strip('|')

                    cursor += 1
                    if is_cell(cursor):
                        if ep_number in {2, 3}:
                            ordered = (second, first)
                        else:
                            ordered = (first, second)
                        found.append(Finito(number, *ordered))
        cursor += 1
    return found
|
|
|
|
def parse_17(wiki_text):
    """Extract ``Finito`` records from single-line wiki table rows.

    Only lines that start with ``|`` (and not with the ``|-`` row separator)
    are considered. Each such line is split on ``|``; empty cells are
    discarded and the rest are stripped. The first cell must be an integer
    episode number, the second is the title (surrounding ``'`` removed) and
    the third is the date. When the third cell is a ``rowspan=`` attribute
    rather than a value, the date is taken from the cell that follows it.

    Args:
        wiki_text: Raw wiki markup.

    Returns:
        A list of ``Finito(number, date, title)`` with ``number`` as ``int``.
        Rows with fewer than three cells, a non-numeric first cell, or a
        ``rowspan=`` attribute with no cell after it are skipped.
    """
    results = []
    for line in wiki_text.split('\n'):
        if not line.startswith('|') or line.startswith('|-'):
            continue

        # Split the line by '|' and drop empty cells.
        parts = [part.strip() for part in line.split('|') if part.strip()]
        if len(parts) < 3:
            continue

        try:
            number = int(parts[0])
        except ValueError:
            continue

        title = parts[1].strip("'")
        date = parts[2]
        if 'rowspan=' in date:
            # The line was already split on '|', so the attribute and its
            # value are separate cells: the date is the next one.
            if len(parts) < 4:
                continue
            date = parts[3]

        results.append(Finito(number, date, title))

    return results
|
|
|
|
|
|
# Command-line driver: ``sys.argv[1]`` names the season. ``<season>.wiki`` is
# read, parsed, written to ``<season>.txt`` as JSON and pretty-printed.
# The wiki text is decoded as UTF-8 explicitly instead of relying on the
# platform's locale encoding.
with open(f'{sys.argv[1]}.wiki', 'r', encoding='utf-8') as fp:
    wiki_source = fp.read()

# '9b' is the one non-numeric season label accepted; it maps to -1, which
# selects parse_wiki_source with its default column layout.
n = int(sys.argv[1]) if sys.argv[1] != '9b' else -1
if n in {1, 2, 3}:
    episodes = list(parse_melevisione_data(wiki_source, n))
else:
    episodes = list(parse_wiki_source(wiki_source, n))

with open(f'{sys.argv[1]}.txt', 'w', encoding='utf-8') as fp:
    fp.write(json.dumps(episodes, indent=4))

pp.pp(episodes)
|