"""Parse per-season episode tables written in wiki markup and dump them as JSON.

Run as ``python <this file> <season>``: the script reads ``<season>.wiki``,
extracts one ``Finito`` record per episode, writes the records to
``<season>.txt`` as JSON and pretty-prints them.
"""
import re
import json
import sys
import pprint as pp
from collections import namedtuple

# Output record shared by all parsers.
Finito = namedtuple('Finito', ['Number', 'Date', 'Title'])

# One table row of the form ``| <digits> | <cell> | <cell> | <cell> |``.
# The second cell may be empty; the character classes also match newlines.
_ROW_PATTERN = re.compile(
    r'\|\s*(\d+)\s*\|\s*([^\|]*)\s*\|\s*([^\|]+)\s*\|\s*([^\|]+)\s*\|'
)

# Seasons whose rows are read as (total number, title, date, number).
_LATE_SEASONS = frozenset({13, 14, 15, 16, 17})


def parse_wiki_source(wiki_source, stagione):
    """Yield a ``Finito`` for every four-cell table row found in *wiki_source*.

    The order of the captured cells depends on *stagione*:

    * ``8``: total number, date, title, number;
    * ``13`` to ``17``: total number, title, date, number, followed by some
      clean-up of the title and date cells;
    * anything else: total number, number, date, title.

    Every row is scanned (and its debug trace printed) before the first
    record is yielded. ``Number`` is the matched digit string; ``Date`` and
    ``Title`` are stripped of surrounding whitespace.
    """
    # NOTE(review): the source reached review with its line breaks collapsed,
    # so the nesting of the late-season clean-up below is a reconstruction.
    # Confirm it against the original layout.
    episodes = []
    for match in _ROW_PATTERN.findall(wiki_source):
        # Debug trace, kept so the script's stdout stays the same.
        print(f'----------------------- {list(match)}')
        if stagione == 8:
            n_tot, prima_tv_ita, nome_della_puntata, _ = match
        elif stagione not in _LATE_SEASONS:
            n_tot, _, prima_tv_ita, nome_della_puntata = match
        else:
            n_tot, nome_della_puntata, prima_tv_ita, n = [
                m.strip() for m in match
            ]
            if n.endswith(('2011', '2012')):
                # The last cell holds the date, so the cell before it is
                # the title.
                nome_della_puntata = prima_tv_ita
                prima_tv_ita = n
            if prima_tv_ita.startswith('rowspan'):
                # The date cell only captured a rowspan attribute; fall back
                # to the last cell.
                prima_tv_ita = n
            # Drop trailing/leading link brackets and italic quote marks.
            nome_della_puntata = (
                nome_della_puntata.strip(']]')
                .strip("'")
                .strip(']')
                .replace("''", '')
            )
            print('FFFFFFF', list(match))
        episodes.append(
            Finito(n_tot, prima_tv_ita.strip(), nome_della_puntata.strip())
        )
    yield from episodes


def parse_melevisione_data(text, ep_number):
    """Return a list of ``Finito`` records from a one-cell-per-line table.

    A record starts at a line that begins with ``|`` and contains ``||``;
    the text before the first ``||`` is the episode number. The next three
    lines must all begin with ``|``: the first two are data cells and the
    third is only checked for presence. For *ep_number* 2 and 3 the record is
    ``Finito(number, second cell, first cell)``; otherwise it is
    ``Finito(number, first cell, second cell)``.
    """
    # NOTE(review): nesting reconstructed from a source with collapsed line
    # breaks; the final ``i += 1`` is taken to belong to the ``while`` body,
    # otherwise non-matching lines would never be skipped.
    results = []
    lines = text.split('\n')
    i = 0
    while i < len(lines):
        line = lines[i]
        if line.startswith('|') and '||' in line:
            episode_num = line.strip('|').split('||')[0].strip()
            i += 1
            if i < len(lines) and lines[i].startswith('|'):
                # Strips pipes and the italic quote marks around the cell.
                first_cell = lines[i].strip("|'").strip('|')
                i += 1
                if i < len(lines) and lines[i].startswith('|'):
                    second_cell = lines[i].strip('|')
                    i += 1
                    if i < len(lines) and lines[i].startswith('|'):
                        if ep_number in {2, 3}:
                            # These seasons list the two cells the other
                            # way round.
                            results.append(
                                Finito(episode_num, second_cell, first_cell)
                            )
                        else:
                            results.append(
                                Finito(episode_num, first_cell, second_cell)
                            )
        i += 1
    return results


def parse_17(wiki_text):
    """Return ``Finito`` records from rows written on a single line.

    A row is a line starting with ``|`` (but not ``|-``) whose non-empty,
    pipe-separated cells are: an integer number, a title (surrounding ``'``
    removed) and a date. When the date cell carries a ``rowspan=`` attribute,
    the date is read from the cell that follows the attribute. Rows whose
    first cell is not an integer, or that lack the needed cells, are skipped.
    """
    results = []
    for line in wiki_text.split('\n'):
        if not line.startswith('|') or line.startswith('|-'):
            continue
        parts = [part.strip() for part in line.split('|') if part.strip()]
        if len(parts) < 3:
            continue
        try:
            number = int(parts[0])
        except ValueError:
            continue
        title = parts[1].strip("'")
        date = parts[2]
        if 'rowspan=' in date:
            # ``parts`` is already split on '|', so the attribute and the
            # cell text are separate entries: the date is the next one.
            if len(parts) < 4:
                continue
            date = parts[3]
        results.append(Finito(number, date, title))
    return results


def main():
    """Parse ``<season>.wiki`` and write the episodes to ``<season>.txt``."""
    if len(sys.argv) < 2:
        sys.exit(f'usage: {sys.argv[0]} <season>')
    season_arg = sys.argv[1]

    with open(f'{season_arg}.wiki', 'r') as fp:
        wiki_source = fp.read()

    # '9b' is the only non-numeric season name accepted; it maps to -1.
    n = int(season_arg) if season_arg != '9b' else -1
    if n in {1, 2, 3}:
        episodes = list(parse_melevisione_data(wiki_source, n))
    else:
        episodes = list(parse_wiki_source(wiki_source, n))

    with open(f'{season_arg}.txt', 'w') as fp:
        fp.write(json.dumps(episodes, indent=4))
    pp.pp(episodes)


if __name__ == '__main__':
    main()