# Melevisione/parse.py

import re
import json
import sys
import pprint as pp
from collections import namedtuple

# Output record shared by every parser in this file: three positional
# fields named Number, Date and Title, in that order.
Finito = namedtuple('Finito', 'Number Date Title')


def parse_wiki_source(wiki_source, ep_):
    """Yield one ``Finito`` record per table row matched in *wiki_source*.

    A row is any run of text matching: a pipe, a cell made only of digits,
    then three more pipe-delimited cells followed by a closing pipe.

    Args:
        wiki_source: The wiki markup to scan, as a single string.
        ep_: Integer identifying the page being parsed.  Values 13-17 select
            the alternate column layout described below; every other value
            uses the default layout.

    Yields:
        ``Finito(Number, Date, Title)`` with ``Date`` and ``Title`` stripped
        of surrounding whitespace.

    Column layouts:
        * Default: the four cells are (number, <ignored>, date, title).
        * Alternate (13-17): the cells are (number, title, date, trailing).
          If the trailing cell ends in ``2011`` or ``2012`` it is taken as
          the date and the cell before it as the title.  If the date cell
          starts with ``rowspan`` the trailing cell is used as the date.
          Wiki link/italic decoration is then trimmed from the title.
    """
    row_pattern = re.compile(
        r'\|\s*(\d+)\s*\|\s*([^\|]*)\s*\|\s*([^\|]+)\s*\|\s*([^\|]+)\s*\|'
    )
    alternate_layout = ep_ in {13, 14, 15, 16, 17}

    for cells in row_pattern.findall(wiki_source):
        if not alternate_layout:
            # The second cell is not part of the emitted record.
            number, _, date, title = cells
        else:
            number, title, date, trailing = (cell.strip() for cell in cells)
            if trailing.endswith(('2011', '2012')):
                # The row is shifted one column to the right: the cell read
                # as "date" is really the title and the trailing cell is the
                # date.
                title, date = date, trailing
            if date.startswith('rowspan'):
                # The date cell holds a table attribute, not a date.
                date = trailing
            # Trim trailing/leading ']' and quote characters, then drop any
            # remaining '' italic markers inside the title.
            title = title.strip(']]').strip("'").strip(']').replace("''", '')

        yield Finito(number, date.strip(), title.strip())


def parse_melevisione_data(text, ep_number):
    """Collect ``Finito`` records from table rows laid out one cell per line.

    A record begins at a line that starts with ``|`` and contains ``||``;
    the text before the first ``||`` (outer pipes and whitespace removed) is
    the record number.  The next three lines must each start with ``|``:

    * the second line, with surrounding pipes and apostrophes removed;
    * the third line, with surrounding pipes removed;
    * a fourth line, which only has to be present (its content is unused).

    Args:
        text: The wiki markup to scan, as a single string.
        ep_number: Integer identifying the page.  For 2 and 3 the third line
            is stored as ``Date`` and the second as ``Title``; for any other
            value the second line is stored as ``Date`` and the third as
            ``Title``.

    Returns:
        A list of ``Finito`` records in the order they were found.
    """
    rows = text.split('\n')
    total = len(rows)
    swap_columns = ep_number in {2, 3}
    found = []

    cursor = 0
    while cursor < total:
        header = rows[cursor]
        cursor += 1
        if not header.startswith('|') or '||' not in header:
            continue

        number = header.strip('|').split('||')[0].strip()

        # Each candidate line is consumed whether or not it qualifies, so
        # the cursor is advanced before bailing out of a partial record.
        if cursor >= total or not rows[cursor].startswith('|'):
            cursor += 1
            continue
        second_cell = rows[cursor].strip("|'")
        cursor += 1

        if cursor >= total or not rows[cursor].startswith('|'):
            cursor += 1
            continue
        third_cell = rows[cursor].strip('|')
        cursor += 1

        has_fourth_cell = cursor < total and rows[cursor].startswith('|')
        cursor += 1
        if not has_fourth_cell:
            continue

        if swap_columns:
            found.append(Finito(number, third_cell, second_cell))
        else:
            found.append(Finito(number, second_cell, third_cell))

    return found

def parse_17(wiki_text):
    """Extract ``Finito`` records from table rows written on a single line.

    Only lines that start with ``|`` (but not the ``|-`` row separator) are
    considered.  Each such line is split on ``|``; empty cells are dropped
    and the rest are whitespace-trimmed.  A row is kept when it has at least
    three cells and the first one parses as an integer.

    Args:
        wiki_text: The wiki markup to scan, as a single string.

    Returns:
        A list of ``Finito(Number, Date, Title)`` where ``Number`` is an
        ``int``, ``Title`` is the second cell with surrounding apostrophes
        removed, and ``Date`` is the third cell -- or the fourth cell when
        the third one is a ``rowspan=`` attribute.  Rows whose first cell is
        not an integer, or whose ``rowspan=`` cell has nothing after it, are
        skipped.
    """
    results = []

    for line in wiki_text.split('\n'):
        if not line.startswith('|') or line.startswith('|-'):
            continue

        parts = [part.strip() for part in line.split('|') if part.strip()]
        if len(parts) < 3:
            continue

        try:
            number = int(parts[0])
        except ValueError:
            # First cell is not a row number, so this is not a data row.
            continue

        title = parts[1].strip("'")

        date = parts[2]
        if 'rowspan=' in date:
            # The line was already split on '|', so the attribute and the
            # value it decorates are separate cells: the date is the cell
            # that follows the attribute.
            if len(parts) < 4:
                continue
            date = parts[3]

        results.append(Finito(number, date, title))

    return results


# Command-line driver: ``python parse.py N`` reads ``N.wiki`` from the
# current directory, parses it, writes the records as JSON to ``N.txt`` and
# pretty-prints them to stdout.
page_arg = sys.argv[1]

# Decode/encode explicitly as UTF-8 so the result does not depend on the
# platform's default locale encoding.
with open(f'{page_arg}.wiki', 'r', encoding='utf-8') as fp:
    wiki_source = fp.read()

page_number = int(page_arg)

# Pages 1-3 use the one-cell-per-line parser; all others use the regex one.
if page_number in {1, 2, 3}:
    episodes = list(parse_melevisione_data(wiki_source, page_number))
else:
    episodes = list(parse_wiki_source(wiki_source, page_number))

with open(f'{page_arg}.txt', 'w', encoding='utf-8') as fp:
    fp.write(json.dumps(episodes, indent=4))

pp.pp(episodes)