Melevisione/parse.py

import re
import json
import sys
import pprint as pp
from collections import namedtuple

Finito = namedtuple('Finito', ['Number', 'Date', 'Title'])


def parse_wiki_source(wiki_source, stagione):
    """Yield one ``Finito`` per pipe-delimited table row found in the wikitext.

    A row is any run of text shaped like ``| <digits> | a | b | c |`` where no
    cell contains ``|`` and cells ``b`` and ``c`` are non-empty.  How the three
    free cells map onto the output depends on ``stagione``:

    * ``8``: ``a`` is the air date and ``b`` the title.
    * ``13`` to ``17``: ``a`` is the title and ``b`` the air date, subject to
      the two repairs commented inline below.
    * any other value: ``b`` is the air date and ``c`` the title.

    Args:
        wiki_source: Wikitext to scan.
        stagione: Season number selecting the column layout.

    Yields:
        ``Finito(Number, Date, Title)``.  ``Number`` is the matched digit
        string; ``Date`` and ``Title`` have surrounding whitespace removed.
    """
    row_pattern = re.compile(
        r'\|\s*(\d+)\s*\|\s*([^\|]*)\s*\|\s*([^\|]+)\s*\|\s*([^\|]+)\s*\|'
    )

    finished = []
    for match in row_pattern.findall(wiki_source):
        # Debug traces are kept verbatim: existing runs emit them on stdout.
        print(f'----------------------- {list(match)}')
        if stagione == 8:
            n_tot, prima_tv_ita, nome_della_puntata, _ = match
        elif stagione not in {13, 14, 15, 16, 17}:
            n_tot, _, prima_tv_ita, nome_della_puntata = match
        else:
            n_tot, nome_della_puntata, prima_tv_ita, last_cell = (
                cell.strip() for cell in match
            )
            # When the last cell ends in a 2011/2012 year it is the date, and
            # the title sits one cell to its left.
            if last_cell.endswith(('2011', '2012')):
                nome_della_puntata = prima_tv_ita
                prima_tv_ita = last_cell
            # A 'rowspan...' attribute occupying the date cell means the real
            # value is in the last cell.
            if prima_tv_ita.startswith('rowspan'):
                prima_tv_ita = last_cell
            # Peel wiki-link brackets and italic quote marks off the title.
            nome_della_puntata = (
                nome_della_puntata.strip(']').strip("'").strip(']').replace("''", '')
            )
        print('FFFFFFF', list(match))

        finished.append(
            Finito(n_tot, prima_tv_ita.strip(), nome_della_puntata.strip())
        )

    # Rows are emitted only after the whole input has been scanned, so all
    # debug output precedes the first yielded item.
    yield from finished


def parse_melevisione_data(text, ep_number):
    """Collect episodes from tables that put each cell on its own line.

    An entry starts at a line that begins with ``|`` and contains ``||``.  The
    text before the first ``||`` (outer pipes and whitespace removed) is taken
    as the episode number.  The entry is kept only if the three lines that
    follow all begin with ``|``.  Of those, the first is stripped of ``|`` and
    ``'`` characters at both ends, the second of ``|`` only, and the third is
    merely required to be present.

    Args:
        text: Wikitext to scan.
        ep_number: Season number.  For 2 and 3 the first following line is
            stored as ``Title`` and the second as ``Date``; for any other
            value the two are stored the other way round.

    Returns:
        List of ``Finito(Number, Date, Title)`` in order of appearance.
    """
    rows = text.split('\n')
    total = len(rows)
    found = []
    pos = 0
    while pos < total:
        header = rows[pos]
        pos += 1
        if not header.startswith('|') or '||' not in header:
            continue
        number = header.strip('|').split('||')[0].strip()

        # A missing or malformed cell line aborts the entry: that line is
        # skipped too and scanning resumes on the line after it.
        if pos >= total or not rows[pos].startswith('|'):
            pos += 1
            continue
        first_cell = rows[pos].strip("|'")
        pos += 1

        if pos >= total or not rows[pos].startswith('|'):
            pos += 1
            continue
        second_cell = rows[pos].strip('|')
        pos += 1

        # The third cell line is never read, only checked for presence.
        if pos < total and rows[pos].startswith('|'):
            if ep_number in {2, 3}:
                found.append(Finito(number, second_cell, first_cell))
            else:
                found.append(Finito(number, first_cell, second_cell))
        pos += 1
    return found

def parse_17(wiki_text):
    """Parse table rows written on a single line, e.g. ``| 1 || ''Title'' || date``.

    Each line that begins with ``|`` (but not ``|-``) is split on ``|``; empty
    and whitespace-only pieces are discarded.  The first remaining piece must
    be an integer episode number, the second is the title (outer ``'``
    characters removed) and the third is the date.  If the third piece is a
    ``rowspan=`` attribute rather than a value, the date is read from the
    piece that follows it.

    Args:
        wiki_text: Wikitext to scan.

    Returns:
        List of ``Finito(Number, Date, Title)`` with ``Number`` as an ``int``.
        Lines with fewer than three pieces, a non-integer first piece, or a
        ``rowspan=`` attribute with nothing after it are skipped.
    """
    results = []

    for line in wiki_text.split('\n'):
        # '|-' is the wikitable row separator, not a data row.
        if not line.startswith('|') or line.startswith('|-'):
            continue

        parts = [part.strip() for part in line.split('|') if part.strip()]
        if len(parts) < 3:
            continue

        try:
            number = int(parts[0])
        except ValueError:
            continue
        title = parts[1].strip("'")

        if 'rowspan=' in parts[2]:
            # The attribute and its cell content are separated by a single
            # '|', so after the split above the content is the next piece.
            # (Splitting parts[2] on '|' again can never find it.)
            if len(parts) < 4:
                continue
            date = parts[3]
        else:
            date = parts[2]

        results.append(Finito(number, date, title))

    return results


# Script entry point.  The single command-line argument is the season label:
# it names the input file `<label>.wiki` and the output file `<label>.txt`.
#
# The wikitext holds accented Italian titles, so it is decoded explicitly
# instead of relying on the platform's default encoding.
# NOTE(review): assumes the .wiki dumps are saved as UTF-8 - confirm.
with open(f'{sys.argv[1]}.wiki', 'r', encoding='utf-8') as fp:
    wiki_source = fp.read()

# The label '9b' is not numeric; it is mapped to -1, which matches none of the
# season-specific cases and therefore takes the default column layout.
n = int(sys.argv[1]) if sys.argv[1] != '9b' else -1
if n in {1, 2, 3}:
    episodes = list(parse_melevisione_data(wiki_source, n))
else:
    episodes = list(parse_wiki_source(wiki_source, n))

# namedtuples are serialised by json as plain arrays: [Number, Date, Title].
# json.dumps escapes non-ASCII by default, so the output is pure ASCII.
with open(f'{sys.argv[1]}.txt', 'w', encoding='utf-8') as fp:
    fp.write(json.dumps(episodes, indent=4))
pp.pp(episodes)
first 2024-11-05 11:47:28 +01:00			`import re`
			`import json`
			`import sys`
			`import pprint as pp`
			`from collections import namedtuple`

			`Finito = namedtuple('Finito', ['Number', 'Date', 'Title'])`


fixed s 8 2024-11-05 14:44:10 +01:00			`def parse_wiki_source(wiki_source, stagione):`
first 2024-11-05 11:47:28 +01:00			`Episode = namedtuple('Episode', ['N_tot', 'N', 'Prima_TV_ita', 'Nome_della_puntata'])`

			`pattern = re.compile(`
			`r'\\|\s*(\d+)\s*\\|\s*([^\\|]*)\s*\\|\s*([^\\|]+)\s*\\|\s*([^\\|]+)\s*\\|'`
			`)`
			`other = re.compile(r'\\|\s*(\d+)\s*\\|\\|\s*(\d+)\s*\n\\|\'\'([^\']+)\'\'\n\\|(\d{1,2}\s+\w+\s+\d{4})\n\\|([^\n]+)')`

			`matches = pattern.findall(wiki_source)`
			`matches2 = other.findall(wiki_source)`

			`episodes = []`
			`for match in matches:`
fixed s 8 2024-11-05 14:44:10 +01:00			`print(f'----------------------- {list(match)}')`
			`if stagione == 8:`
			`n_tot, prima_tv_ita, nome_della_puntata, n = match`
			`elif stagione not in {13, 14, 15, 16, 17}:`
first 2024-11-05 11:47:28 +01:00			`n_tot, n, prima_tv_ita, nome_della_puntata = match`
			`else:`
			`n_tot, nome_della_puntata, prima_tv_ita, n = [m.strip() for m in match]`
fixed s 8 2024-11-05 14:44:10 +01:00			`if n.endswith('2011') or n.endswith('2012'):`
first 2024-11-05 11:47:28 +01:00			`nome_della_puntata = prima_tv_ita`
			`prima_tv_ita = n`
			`if prima_tv_ita.startswith('rowspan'):`
			`prima_tv_ita = n`
			`nome_della_puntata = nome_della_puntata.strip(']]').strip("'").strip(']').replace("''", '')`
			`print('FFFFFFF', list(match))`

			`if n.strip() == '':`
			`n = None`
			`episodes.append(Episode(n_tot, n, prima_tv_ita, nome_della_puntata))`

			`for e in episodes:`
			`e_ = Finito(e.N_tot, e.Prima_TV_ita.strip(), e.Nome_della_puntata.strip())`
			`yield e_`


			`def parse_melevisione_data(text, ep_number):`
			`results = []`
			`lines = text.split('\n')`
			`i = 0`
			`while i < len(lines):`
			`if lines[i].startswith('\|') and '\|\|' in lines[i]:`
			`# This line contains episode numbers`
			`numbers = lines[i].strip('\|').split('\|\|')`
			`if len(numbers) >= 1:`
			`episode_num1 = numbers[0].strip()`
			`# episode_num2 = numbers[1].strip() if len(numbers) >= 2 else ''`

			`# Next line should be the title`
			`i += 1`
			`if i < len(lines) and (lines[i].startswith("\|''") or lines[i].startswith('\|')):`
			`title = lines[i].strip("\|'").strip('\|')`

			`# Next line should be the date`
			`i += 1`
			`if i < len(lines) and lines[i].startswith('\|'):`
			`date = lines[i].strip('\|')`

			`i += 1`
			`if i < len(lines) and lines[i].startswith('\|'):`
			`# characters = lines[i].strip('\|')`

			`if ep_number in {2, 3}:`
			`# gotta switch`
			`results.append(Finito(episode_num1, date, title))`
			`else:`
			`results.append(Finito(episode_num1, title, date))`
			`i += 1`
			`return results`

			`def parse_17(wiki_text):`
			`results = []`
			`lines = wiki_text.split('\n')`

			`for line in lines:`
			`if line.startswith('\|') and not line.startswith('\|-') and not line.startswith('!'):`
			`# Split the line by '\|' and remove empty strings`
			`parts = [part.strip() for part in line.split('\|') if part.strip()]`

			`if len(parts) >= 3:`
			`try:`
			`number = int(parts[0])`
			`title = parts[1].strip("'")`

			`# Look ahead for date in next row if this row doesn't have it`
			`date = None`
			`if 'rowspan=' in parts[2]:`
			`date = parts[2].split('\|')[1].strip()`
			`else:`
			`date = parts[2].strip()`

			`results.append(Finito(number, date, title))`
			`except (ValueError, IndexError):`
			`continue`

			`return results`


			`with open(f'{sys.argv[1]}.wiki', 'r') as fp:`
			`wiki_source = fp.read()`

more 2024-11-05 14:40:30 +01:00			`n = int(sys.argv[1]) if sys.argv[1] != '9b' else -1`
			`if n in {1, 2, 3}:`
			`episodes = list(parse_melevisione_data(wiki_source, n))`
first 2024-11-05 11:47:28 +01:00			`else:`
more 2024-11-05 14:40:30 +01:00			`episodes = list(parse_wiki_source(wiki_source, n))`
first 2024-11-05 11:47:28 +01:00
			`with open(f'{sys.argv[1]}.txt', 'w') as fp:`
			`fp.write(json.dumps(episodes, indent=4))`
			`pp.pp(episodes)`