UniTO/tesi/conv.py
2020-04-12 17:30:03 +02:00

80 lines
2.9 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import json
import re
from sys import argv
# Load the Unicode -> LaTeX lookup table.  The file is searched for in the
# current directory first and in the parent directory as a fallback.
try:
    with open('./unicode-latex.json') as fp:
        allsymbols = json.load(fp)
except OSError:
    # Only a missing/unreadable file triggers the fallback; a malformed JSON
    # file is reported to the user instead of being silently masked.
    with open('../unicode-latex.json') as fp:
        allsymbols = json.load(fp)
# Unicode characters that are translated through the generic lookup table.
# NOTE(review): many entries show up as empty strings in this listing; the
# hosting page warns about ambiguous Unicode characters, so confirm that the
# literal code points are intact before relying on this copy.
mysymbols = ['', '', '', '', '', '', '', '', '', 'ε', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', 'ʲ', '', 'π', 'α', 'β', '', 'σ', '', '', '', '', '', '', '', '', '', '', '', 'ˡ', '', '', '', '', '', '' ]
# Hand-written translations that extend (and override) the generic table.
# The LaTeX commands are raw strings so that sequences such as ``\n`` in
# ``\neg`` are kept as a backslash followed by a letter instead of being
# interpreted as control characters.
extrasymbols = {'': r'\llbracket', '': r'\rrbracket', '̸': r'\neg', '¬̸': r'\neg', '': r'\in ', '': '_S', '': '_T'}
# Translation used inside math mode: the bare LaTeX command.
symbols = {s: allsymbols[s] for s in mysymbols}
symbols.update(extrasymbols)
# Translation used outside math mode: the same command wrapped in ``$...$``.
mathsymbols = {s: '$'+v+'$' for s, v in symbols.items()}
def read_by_char(fname):
    """Yield ``(character, mathmode)`` pairs for every character of a file.

    ``mathmode`` tells whether the line the character belongs to lies inside
    a display-math block.  A block is opened by a line that contains one of
    the opening markers as a space-separated word and closed by a line that
    contains one of the closing markers.  The flag is updated before the
    characters of the line are emitted, so the opening line is reported as
    math and the closing line as non-math.

    Args:
        fname: path of the text file to read.

    Yields:
        Tuples ``(ch, mathmode)`` in file order.

    Raises:
        AssertionError: if a block is opened while another one is still open,
            or closed while none is open.
    """
    mathmode_begin = {r'\begin{equation*}', r'\begin{equation}', r'\[', r'\begin{mathpar}'}
    mathmode_end = {r'\end{equation*}', r'\end{equation}', r'\]', r'\end{mathpar}'}
    mathmode = False
    with open(fname, 'r') as fp:
        # Iterate the file lazily; ``cnt`` is the 1-based line number.
        for cnt, line in enumerate(fp, start=1):
            words = [w.strip() for w in line.split(' ')]
            if mathmode_begin.intersection(words):
                # Raised explicitly so the check survives ``python -O``.
                if mathmode:
                    raise AssertionError(f'Line: {words}, number: {cnt}')
                mathmode = True
            if mathmode_end.intersection(words):
                if not mathmode:
                    raise AssertionError(f'Line: {words}, number: {cnt}')
                mathmode = False
            for ch in line:
                yield ch, mathmode
def convert(ch, mathmode):
    """Translate a single character to its LaTeX spelling.

    Inside math mode the bare command from ``symbols`` is used; outside of it
    the ``$``-wrapped variant from ``mathsymbols`` is used.  Characters with
    no entry in the selected table are returned unchanged.
    """
    table = symbols if mathmode else mathsymbols
    return table.get(ch, ch)
def latex_errors_replacements(charlist):
    """Patch known-bad LaTeX fragments in a stream of characters.

    The characters are joined and split on single spaces; every resulting
    word that exactly matches a key of the replacement table is swapped for
    the corresponding value.  All other words pass through untouched.

    Args:
        charlist: iterable of strings (typically single characters).

    Yields:
        The characters of the patched text.  Words are re-joined with exactly
        the spaces that separated them, so no extra space is appended after
        the last word.
    """
    # Backslashes are escaped explicitly: ``\n`` is meant as a newline while
    # ``\\end`` / ``\\subsection`` are literal LaTeX commands.
    replacements = {
        '\n\\end{comment}\n\\end{enumerate}\n\\end{enumerate}\n\n\\subsection{Symbolic':
            '\n\\end{comment}\n\n\\subsection{Symbolic',
    }
    words = ''.join(charlist).split(' ')
    for index, word in enumerate(words):
        if index:
            # Restore the separator consumed by ``split``.
            yield ' '
        yield from replacements.get(word, word)
def ll_rr_bracket(charlist):
    """Drop every ``$`` found between ``\\llbracket`` and ``\\rrbracket``.

    The region starts at the backslash of ``\\llbracket`` and ends at the
    backslash of ``\\rrbracket``; ``$`` characters outside such a region are
    kept.  For example ``$\\llbracket$ x $\\rrbracket$`` becomes
    ``$\\llbracket x \\rrbracket$``.

    Args:
        charlist: string or iterable of strings (typically single
            characters).

    Yields:
        The characters of the text with the inner ``$`` removed.

    Raises:
        AssertionError: if ``\\llbracket`` is nested or ``\\rrbracket``
            appears without a matching opener.
    """
    # Work on a real string: comparing a list slice with a str is always
    # False, which would leave the bracket detection permanently disabled.
    text = ''.join(charlist)
    llrr_mode = False
    for i, ch in enumerate(text):
        if ch == '\\':
            # Raw strings keep ``\r`` from being read as a carriage return.
            if text.startswith(r'\llbracket', i):
                if llrr_mode:
                    raise AssertionError(f'nested \\llbracket at offset {i}')
                llrr_mode = True
            elif text.startswith(r'\rrbracket', i):
                if not llrr_mode:
                    raise AssertionError(f'unmatched \\rrbracket at offset {i}')
                llrr_mode = False
        if not (llrr_mode and ch == '$'):
            yield ch
# Pass 1: translate every character of the input file (argv[1]) according to
# whether it sits inside a math-mode block.
firstpass = [convert(ch, in_math) for ch, in_math in read_by_char(argv[1])]
# Pass 2: patch a known LaTeX error in the translated text.
secondpass = latex_errors_replacements(firstpass)
# Pass 3: post-process the \llbracket ... \rrbracket regions.
thirdpass = ll_rr_bracket([*secondpass])
newfile = ''.join(thirdpass)
# Write the converted document to the output path (argv[2]).
with open(argv[2], 'w') as f:
    f.write(newfile)