# UniTO/tesi/conv.py

import json
import re
from sys import argv

# Full lookup table loaded from disk; presumably maps each Unicode character
# to its LaTeX command (verify against unicode-latex.json). The file is opened
# explicitly as UTF-8 and closed deterministically instead of relying on the
# platform's default encoding and on garbage collection.
with open('./unicode-latex.json', encoding='utf-8') as _fp:
    allsymbols = json.load(_fp)

# The only symbols this script rewrites.
mysymbols = ['≡', '≠', '≼', '→', '←', '⊀', '⋠', '≺', '∀', '∈', 'ε','₀', '₂', '₁', '₃', 'ₐ', 'ₖ', 'ₘ', 'ₙ', 'ᵢ', 'ⁱ', '⋮']

# Replacement used inside a math environment: the bare table entry.
# A symbol missing from the table raises KeyError here, at start-up.
symbols = {s: allsymbols[s] for s in mysymbols}
# Replacement used outside a math environment: the same entry wrapped in
# inline-math delimiters.
mathsymbols = {s: '$' + latex + '$' for s, latex in symbols.items()}

def read_by_char(fname):
    """Yield ``(character, mathmode)`` for every character of the file *fname*.

    ``mathmode`` is True while the current line belongs to an ``equation`` or
    ``equation*`` environment. The flag is updated once per line, before the
    characters of that line are emitted: the line carrying the begin marker is
    already flagged True, the line carrying the end marker is flagged False.
    A marker is only recognised when it is a space-delimited word of its line.

    The file is decoded as UTF-8 so that the Unicode symbols are read
    correctly regardless of the platform's default encoding.

    Raises:
        ValueError: if an environment is opened while one is already open, or
            closed while none is open.
    """
    mathmode_begin = {'\\begin{equation*}', '\\begin{equation}'}
    mathmode_end = {'\\end{equation*}', '\\end{equation}'}
    mathmode = False
    with open(fname, 'r', encoding='utf-8') as fp:
        # Iterate the file lazily instead of loading every line up front.
        for cnt, line in enumerate(fp, start=1):
            words = {w.strip() for w in line.split(' ')}

            if not mathmode_begin.isdisjoint(words):
                # Explicit raise: an assert would vanish under `python -O`.
                if mathmode:
                    raise ValueError(
                        f'nested equation environment at line {cnt}: {line!r}')
                mathmode = True
            if not mathmode_end.isdisjoint(words):
                if not mathmode:
                    raise ValueError(
                        f'unmatched equation end at line {cnt}: {line!r}')
                mathmode = False

            for ch in line:
                yield ch, mathmode

def convert(ch, mathmode):
    """Return the replacement for *ch*, or *ch* itself when it has none.

    With a truthy *mathmode* the lookup uses the module-level ``symbols``
    table; otherwise it uses ``mathsymbols``, whose entries are wrapped in
    ``$...$``.
    """
    table = symbols if mathmode else mathsymbols
    # Characters absent from the chosen table pass through unchanged.
    return table.get(ch, ch)

def latex_errors_replacements(charlist):
    """Yield the characters of the joined *charlist* with known errors fixed.

    The pieces of *charlist* are concatenated and split on single spaces; any
    resulting word that exactly equals a key of the replacement table is
    swapped for its value, every other word is passed through unchanged. The
    words are re-emitted with a single space between them, so text without a
    match comes out exactly as it went in (no trailing space is appended).

    Args:
        charlist: iterable of strings (single characters or longer pieces).

    Yields:
        The corrected text, one character at a time.
    """
    # Backslashes are escaped explicitly: '\e' and '\s' are invalid escape
    # sequences and trigger a SyntaxWarning on recent Python versions.
    replacements = {
        '\n\\end{comment}\n\\end{enumerate}\n\\end{enumerate}\n\n\\subsection{Symbolic':
            '\n\\end{comment}\n\n\\subsection{Symbolic',
    }
    words = ''.join(charlist).split(' ')
    for index, word in enumerate(words):
        # Separator goes between words only, never after the last one.
        if index:
            yield ' '
        yield from replacements.get(word, word)

# Both paths are required: argv[1] is the input file, argv[2] the output file.
# Fail with a usage message instead of a bare IndexError when one is missing.
if len(argv) < 3:
    raise SystemExit(f'usage: {argv[0]} INPUT OUTPUT')

# convert symbols except the one requiring math mode modifiers
firstpass = [convert(ch, mathmode) for ch, mathmode in read_by_char(argv[1])]
# remove a latex error
secondpass = latex_errors_replacements(firstpass)

newfile = ''.join(secondpass)
# Explicit UTF-8 so the output encoding does not depend on the platform locale.
with open(argv[2], 'w', encoding='utf-8') as f:
    f.write(newfile)