# Melevisione/parse.py
#
# 121 lines
# 4.2 KiB
# Python
# Raw Normal View History
#
# 2024-11-05 11:47:28 +01:00
import re
import json
import sys
import pprint as pp
from collections import namedtuple
# Output record shared by every parser in this file: the episode number,
# its date cell and its title cell, in that order.
Finito = namedtuple('Finito', ('Number', 'Date', 'Title'))
# 2024-11-05 14:44:10 +01:00
def parse_wiki_source(wiki_source, stagione):
    """Yield one ``Finito`` record per four-cell table row in *wiki_source*.

    A row is any text shaped like ``| <digits> | cell | cell | cell |``;
    whitespace (newlines included) may surround every cell and the second
    cell may be empty.  The meaning of the three cells that follow the
    running number depends on *stagione*:

    * ``8``: date, title, per-season number;
    * ``13`` to ``17``: title, date, per-season number, then the fix-ups
      commented inline;
    * any other value: per-season number, date, title.

    Args:
        wiki_source: Wiki markup to scan.
        stagione: Integer selecting the column layout.

    Yields:
        ``Finito(Number, Date, Title)``.  ``Number`` is the matched digit
        string; date and title have surrounding whitespace removed.

    Side effects:
        Prints a debug trace of every match to standard output.
    """
    row_pattern = re.compile(
        r'\|\s*(\d+)\s*\|\s*([^\|]*)\s*\|\s*([^\|]+)\s*\|\s*([^\|]+)\s*\|'
    )
    for match in row_pattern.findall(wiki_source):
        print(f'----------------------- {list(match)}')
        if stagione == 8:
            n_tot, prima_tv_ita, nome_della_puntata, _ = match
        elif stagione not in {13, 14, 15, 16, 17}:
            n_tot, _, prima_tv_ita, nome_della_puntata = match
        else:
            # NOTE(review): the indentation of this file was lost; the
            # fix-ups below are assumed to belong to this branch only,
            # because they rely on the stripped cells. Confirm against the
            # repository copy.
            n_tot, nome_della_puntata, prima_tv_ita, n = (
                cell.strip() for cell in match
            )
            if n.endswith(('2011', '2012')):
                # The last cell holds the date, so the third one is the title.
                nome_della_puntata = prima_tv_ita
                prima_tv_ita = n
            if prima_tv_ita.startswith('rowspan'):
                # The date cell was a rowspan attribute; the date follows it.
                prima_tv_ita = n
            # str.strip takes a character set, so ']' covers ']]' as well.
            nome_della_puntata = (
                nome_della_puntata.strip(']')
                .strip("'")
                .strip(']')
                .replace("''", '')
            )
            print('FFFFFFF', list(match))
        # The per-season number never reaches the output, so it is dropped.
        yield Finito(n_tot, prima_tv_ita.strip(), nome_della_puntata.strip())
def parse_melevisione_data(text, ep_number):
    """Parse table rows that are spread over four consecutive lines.

    A record starts at a header line that begins with ``|`` and contains
    ``||``; the text before the first ``||`` is the episode number.  The
    three lines after it must each begin with ``|``; the first two are the
    data cells and the third is only required to be present.  A header
    followed by fewer than three such lines produces no record.

    Args:
        text: Wiki markup to scan.
        ep_number: Layout selector (the script passes the season number).
            For ``2`` and ``3`` the second data cell is stored as ``Date``
            and the first as ``Title``; for any other value the two are
            stored the other way round.

    Returns:
        A list of ``Finito(Number, Date, Title)`` records in input order.
    """
    rows = text.split('\n')
    total = len(rows)
    swap_cells = ep_number in {2, 3}
    results = []
    pos = 0
    while pos < total:
        header = rows[pos]
        if header.startswith('|') and '||' in header:
            number = header.strip('|').split('||')[0].strip()
            # Consume up to three cell lines and stop at the first line that
            # is not a cell.  Every consumed line, and the line that stopped
            # the scan, is never reconsidered as a header.
            cells = []
            while len(cells) < 3:
                pos += 1
                if pos >= total or not rows[pos].startswith('|'):
                    break
                cells.append(rows[pos])
            if len(cells) == 3:
                # Only the first cell also loses surrounding apostrophes.
                first_cell = cells[0].strip("|'")
                second_cell = cells[1].strip('|')
                if swap_cells:
                    results.append(Finito(number, second_cell, first_cell))
                else:
                    results.append(Finito(number, first_cell, second_cell))
        pos += 1
    return results
def parse_17(wiki_text):
    """Parse table rows written on a single line each.

    Every line that starts with ``|`` (but not ``|-``) is split on ``|``;
    empty cells are discarded.  A row needs at least three cells: an integer
    number, a title (surrounding apostrophes removed) and a date.  When the
    third cell is a ``rowspan=`` attribute, the date is taken from the cell
    after it.

    Rows whose first cell is not an integer, or whose rowspan attribute is
    not followed by another cell, are skipped.

    Args:
        wiki_text: Wiki markup to scan.

    Returns:
        A list of ``Finito(Number, Date, Title)`` records in input order,
        with ``Number`` as an ``int``.
    """
    results = []
    for line in wiki_text.split('\n'):
        if not line.startswith('|') or line.startswith('|-'):
            continue
        parts = [cell for cell in (raw.strip() for raw in line.split('|')) if cell]
        if len(parts) < 3:
            continue
        try:
            number = int(parts[0])
        except ValueError:
            continue
        title = parts[1].strip("'")
        date = parts[2]
        if 'rowspan=' in date:
            # The cells were produced by splitting on '|', so the attribute
            # can never contain the date itself; it is the following cell.
            if len(parts) < 4:
                continue
            date = parts[3]
        results.append(Finito(number, date, title))
    return results
def main():
    """Parse ``<season>.wiki`` and dump the episodes to ``<season>.txt``.

    The season identifier is the first command-line argument.  ``9b`` is
    mapped to ``-1``; every other value must be an integer.  Seasons 1 to 3
    are read with ``parse_melevisione_data``, all others with
    ``parse_wiki_source``.  The result is written as indented JSON and also
    pretty-printed to standard output.

    Raises:
        SystemExit: If no season argument was given.
        ValueError: If the argument is neither ``9b`` nor an integer.
        OSError: If the input file cannot be read or the output written.
    """
    if len(sys.argv) < 2:
        sys.exit(f'usage: {sys.argv[0]} SEASON')
    season = sys.argv[1]

    # Explicit encoding so the result does not depend on the platform
    # default.  NOTE(review): assumes the .wiki files are UTF-8; confirm.
    with open(f'{season}.wiki', 'r', encoding='utf-8') as fp:
        wiki_source = fp.read()

    n = -1 if season == '9b' else int(season)
    if n in {1, 2, 3}:
        episodes = list(parse_melevisione_data(wiki_source, n))
    else:
        episodes = list(parse_wiki_source(wiki_source, n))

    with open(f'{season}.txt', 'w', encoding='utf-8') as fp:
        fp.write(json.dumps(episodes, indent=4))
    pp.pp(episodes)


if __name__ == '__main__':
    main()