2020-03-03 17:18:40 +01:00
|
|
|
|
% Created 2020-03-03 Tue 17:18
|
2020-02-24 14:36:26 +01:00
|
|
|
|
% Intended LaTeX compiler: pdflatex
|
|
|
|
|
\documentclass[11pt]{article}
|
|
|
|
|
\usepackage[utf8]{inputenc}
|
|
|
|
|
\usepackage[T1]{fontenc}
|
|
|
|
|
\usepackage{graphicx}
|
|
|
|
|
\usepackage{grffile}
|
|
|
|
|
\usepackage{longtable}
|
|
|
|
|
\usepackage{wrapfig}
|
|
|
|
|
\usepackage{rotating}
|
|
|
|
|
\usepackage[normalem]{ulem}
|
|
|
|
|
\usepackage{amsmath}
|
|
|
|
|
\usepackage{textcomp}
|
|
|
|
|
\usepackage{amssymb}
|
|
|
|
|
\usepackage{capt-of}
|
|
|
|
|
\usepackage{hyperref}
|
|
|
|
|
\usepackage{algorithm}
|
2020-02-24 19:46:00 +01:00
|
|
|
|
\usepackage{comment}
|
2020-02-24 14:36:26 +01:00
|
|
|
|
\usepackage{algpseudocode}
|
|
|
|
|
\usepackage{amsmath,amssymb,amsthm}
|
|
|
|
|
\newtheorem{definition}{Definition}
|
|
|
|
|
\usepackage{graphicx}
|
|
|
|
|
\usepackage{listings}
|
|
|
|
|
\usepackage{color}
|
|
|
|
|
\author{Francesco Mecca}
|
|
|
|
|
\date{}
|
2020-03-03 17:18:40 +01:00
|
|
|
|
\title{Translation Verification of the pattern matching compiler}
|
2020-02-24 14:36:26 +01:00
|
|
|
|
\hypersetup{
|
|
|
|
|
pdfauthor={Francesco Mecca},
|
2020-03-03 17:18:40 +01:00
|
|
|
|
pdftitle={Translation Verification of the pattern matching compiler},
|
2020-02-24 14:36:26 +01:00
|
|
|
|
pdfkeywords={},
|
|
|
|
|
pdfsubject={},
|
|
|
|
|
pdfcreator={Emacs 26.3 (Org mode 9.1.9)},
|
|
|
|
|
pdflang={English}}
|
|
|
|
|
\begin{document}
|
|
|
|
|
|
|
|
|
|
\maketitle
|
2020-02-24 19:46:00 +01:00
|
|
|
|
\begin{comment}
|
2020-03-02 14:46:37 +01:00
|
|
|
|
\section{{\bfseries\sffamily TODO} Scaletta [1/5]}
|
2020-03-03 17:18:40 +01:00
|
|
|
|
\label{sec:org7578cff}
|
2020-02-24 14:36:26 +01:00
|
|
|
|
\begin{itemize}
|
|
|
|
|
\item[{$\boxtimes$}] Abstract
|
2020-02-24 19:46:00 +01:00
|
|
|
|
\item[{$\boxminus$}] Background [40\%]
|
2020-02-24 14:36:26 +01:00
|
|
|
|
\begin{itemize}
|
2020-03-03 17:18:40 +01:00
|
|
|
|
\item[{$\boxtimes$}]
|
|
|
|
|
|
2020-02-24 14:36:26 +01:00
|
|
|
|
\item[{$\square$}] Lambda code [0\%]
|
|
|
|
|
\begin{itemize}
|
2020-03-02 14:46:37 +01:00
|
|
|
|
\item[{$\square$}] Compiler optimizations
|
|
|
|
|
\item[{$\square$}] other instructions
|
2020-02-24 14:36:26 +01:00
|
|
|
|
\end{itemize}
|
2020-03-02 14:46:37 +01:00
|
|
|
|
\item[{$\boxtimes$}] Pattern matching
|
2020-02-24 14:36:26 +01:00
|
|
|
|
\item[{$\square$}] Symbolic execution
|
2020-03-02 14:46:37 +01:00
|
|
|
|
\item[{$\square$}] Translation Validation
|
2020-02-24 14:36:26 +01:00
|
|
|
|
\end{itemize}
|
2020-03-02 14:46:37 +01:00
|
|
|
|
\item[{$\square$}] Translation validation of the Pattern Matching Compiler
|
2020-02-24 14:36:26 +01:00
|
|
|
|
\begin{itemize}
|
|
|
|
|
\item[{$\square$}] Source translation
|
|
|
|
|
\begin{itemize}
|
|
|
|
|
\item[{$\square$}] Formal Grammar
|
|
|
|
|
\item[{$\square$}] Compilation of source patterns
|
2020-03-03 17:18:40 +01:00
|
|
|
|
\item[{$\square$}] Rest?
|
2020-02-24 14:36:26 +01:00
|
|
|
|
\end{itemize}
|
|
|
|
|
\item[{$\square$}] Target translation
|
|
|
|
|
\begin{itemize}
|
|
|
|
|
\item[{$\square$}] Formal Grammar
|
|
|
|
|
\item[{$\square$}] Symbolic execution of the lambda target
|
|
|
|
|
\end{itemize}
|
|
|
|
|
\item[{$\square$}] Equivalence between source and target
|
|
|
|
|
\end{itemize}
|
2020-03-02 14:46:37 +01:00
|
|
|
|
\item[{$\square$}] Proof of correctness
|
2020-02-24 14:36:26 +01:00
|
|
|
|
\item[{$\square$}] Practical results
|
|
|
|
|
\end{itemize}
|
|
|
|
|
|
2020-02-24 19:46:00 +01:00
|
|
|
|
\end{comment}
|
2020-02-24 14:36:26 +01:00
|
|
|
|
|
|
|
|
|
\begin{abstract}
|
|
|
|
|
|
2020-03-03 17:18:40 +01:00
|
|
|
|
This dissertation presents an algorithm for the translation validation of the
|
2020-02-24 14:36:26 +01:00
|
|
|
|
pattern matching compiler. Given the source representation of the target program and the
|
|
|
|
|
target program compiled in untyped lambda form, the algoritmhm is capable of modelling
|
|
|
|
|
the source program in terms of symbolic constraints on it's branches and apply symbolic
|
|
|
|
|
execution on the untyped lambda representation in order to validate wheter the compilation
|
|
|
|
|
produced a valid result.
|
|
|
|
|
In this context a valid result means that for every input in the domain of the source
|
|
|
|
|
program the untyped lambda translation produces the same output as the source program.
|
|
|
|
|
The input of the program is modelled in terms of symbolic constraints closely related to
|
2020-03-03 17:18:40 +01:00
|
|
|
|
the runtime representation of objects and the output consists of OCaml code
|
2020-02-24 14:36:26 +01:00
|
|
|
|
blackboxes that are not evaluated in the context of the verification.
|
|
|
|
|
|
|
|
|
|
\end{abstract}
|
|
|
|
|
|
|
|
|
|
\section{Background}
|
2020-03-03 17:18:40 +01:00
|
|
|
|
\label{sec:org5b6accf}
|
2020-02-24 14:36:26 +01:00
|
|
|
|
|
2020-03-03 17:18:40 +01:00
|
|
|
|
\subsection{}
|
|
|
|
|
\label{sec:org3c9e604}
|
|
|
|
|
Objective Caml () is a dialect of the ML (Meta-Language) family of programming
|
2020-02-24 14:36:26 +01:00
|
|
|
|
languages.
|
2020-03-03 17:18:40 +01:00
|
|
|
|
shares many features with other dialects of ML, such as SML and Caml Light,
|
2020-02-24 14:36:26 +01:00
|
|
|
|
The main features of ML languages are the use of the Hindley-Milner type system that
|
|
|
|
|
provides many advantages with respect to static type systems of traditional imperative and object
|
|
|
|
|
oriented language such as C, C++ and Java, such as:
|
|
|
|
|
\begin{itemize}
|
2020-03-02 14:46:37 +01:00
|
|
|
|
\item Polymorphism: in certain scenarios a function can accept more than one
|
2020-02-24 14:36:26 +01:00
|
|
|
|
type for the input parameters. For example a function that computes the lenght of a
|
|
|
|
|
list doesn't need to inspect the type of the elements of the list and for this reason
|
2020-03-02 14:46:37 +01:00
|
|
|
|
a List.length function can accept lists of integers, lists of strings and in general
|
|
|
|
|
lists of any type. Such languages offer polymorphic functions through subtyping at
|
2020-02-24 14:36:26 +01:00
|
|
|
|
runtime only, while other languages such as C++ offer polymorphism through compile
|
|
|
|
|
time templates and function overloading.
|
|
|
|
|
With the Hindley-Milner type system each well typed function can have more than one
|
|
|
|
|
type but always has a unique best type, called the \emph{principal type}.
|
|
|
|
|
For example the principal type of the List.length function is "For any \emph{a}, function from
|
|
|
|
|
list of \emph{a} to \emph{int}" and \emph{a} is called the \emph{type parameter}.
|
|
|
|
|
\item Strong typing: Languages such as C and C++ allow the programmer to operate on data
|
|
|
|
|
without considering its type, mainly through pointers. Other languages such as C\#
|
|
|
|
|
and Go allow type erasure so at runtime the type of the data can't be queried.
|
|
|
|
|
In the case of programming languages using an Hindley-Milner type system the
|
|
|
|
|
programmer is not allowed to operate on data by ignoring or promoting its type.
|
|
|
|
|
\item Type Inference: the principal type of a well formed term can be inferred without any
|
|
|
|
|
annotation or declaration.
|
|
|
|
|
\item Algebraic data types: types that are modelled by the use of two
|
|
|
|
|
algebraic operations, sum and product.
|
|
|
|
|
A sum type is a type that can hold of many different types of
|
|
|
|
|
objects, but only one at a time. For example the sum type defined
|
|
|
|
|
as \emph{A + B} can hold at any moment a value of type A or a value of
|
|
|
|
|
type B. Sum types are also called tagged union or variants.
|
|
|
|
|
A product type is a type constructed as a direct product
|
|
|
|
|
of multiple types and contains at any moment one instance for
|
|
|
|
|
every type of its operands. Product types are also called tuples
|
|
|
|
|
or records. Algebraic data types can be recursive
|
|
|
|
|
in their definition and can be combined.
|
|
|
|
|
\end{itemize}
|
|
|
|
|
Moreover ML languages are functional, meaning that functions are
|
|
|
|
|
treated as first class citizens and variables are immutable,
|
|
|
|
|
although mutable statements and imperative constructs are permitted.
|
2020-03-03 17:18:40 +01:00
|
|
|
|
In addition to that features an object system, that provides
|
2020-02-24 14:36:26 +01:00
|
|
|
|
inheritance, subtyping and dynamic binding, and modules, that
|
|
|
|
|
provide a way to encapsulate definitions. Modules are checked
|
|
|
|
|
statically and can be reificated through functors.
|
|
|
|
|
|
2020-03-02 14:46:37 +01:00
|
|
|
|
\subsection{Lambda form compilation}
|
2020-03-03 17:18:40 +01:00
|
|
|
|
\label{sec:org6065c14}
|
2020-03-02 14:46:37 +01:00
|
|
|
|
\begin{comment}
|
2020-03-03 17:18:40 +01:00
|
|
|
|
https://dev.realworld.org/compiler-backend.html
|
2020-03-02 14:46:37 +01:00
|
|
|
|
\end{comment}
|
|
|
|
|
|
2020-03-03 17:18:40 +01:00
|
|
|
|
provides compilation in form of a byecode executable with an
|
2020-03-02 14:46:37 +01:00
|
|
|
|
optionally embeddable interpreter and a native executable that could
|
|
|
|
|
be statically linked to provide a single file executable.
|
|
|
|
|
|
2020-03-03 17:18:40 +01:00
|
|
|
|
After the typechecker has proven that the program is type safe,
|
|
|
|
|
the compiler lower the code to \emph{Lambda}, an s-expression based
|
2020-03-02 14:46:37 +01:00
|
|
|
|
language that assumes that its input has already been proved safe.
|
|
|
|
|
On the \emph{Lambda} representation of the source program, the compiler
|
|
|
|
|
performes a series of optimization passes before translating the
|
|
|
|
|
lambda form to assembly code.
|
|
|
|
|
|
2020-02-24 14:36:26 +01:00
|
|
|
|
\begin{enumerate}
|
2020-03-03 17:18:40 +01:00
|
|
|
|
\item datatypes
|
|
|
|
|
\label{sec:org7b158eb}
|
2020-03-02 14:46:37 +01:00
|
|
|
|
|
2020-03-03 17:18:40 +01:00
|
|
|
|
Most native data types in , such as integers, tuples, lists,
|
2020-03-02 14:46:37 +01:00
|
|
|
|
records, can be seen as instances of the following definition
|
|
|
|
|
|
|
|
|
|
\begin{verbatim}
|
|
|
|
|
type t = Nil | One of int | Cons of int * t
|
|
|
|
|
\end{verbatim}
|
|
|
|
|
that is a type \emph{t} with three constructors that define its complete
|
|
|
|
|
signature.
|
|
|
|
|
Every constructor has an arity. Nil, a constructor of arity 0, is
|
|
|
|
|
called a constant constructor.
|
|
|
|
|
|
|
|
|
|
\item Lambda form types
|
2020-03-03 17:18:40 +01:00
|
|
|
|
\label{sec:org737fa2f}
|
|
|
|
|
A lambda form target file produced by the compiler consists of a
|
2020-03-02 14:46:37 +01:00
|
|
|
|
single s-expression. Every s-expression consist of \emph{(}, a sequence of
|
|
|
|
|
elements separated by a whitespace and a closing \emph{)}.
|
|
|
|
|
Elements of s-expressions are:
|
|
|
|
|
\begin{itemize}
|
|
|
|
|
\item Atoms: sequences of ascii letters, digits or symbols
|
|
|
|
|
\item Variables
|
|
|
|
|
\item Strings: enclosed in double quotes and possibly escaped
|
|
|
|
|
\item S-expressions: allowing arbitrary nesting
|
|
|
|
|
\end{itemize}
|
|
|
|
|
|
|
|
|
|
There are several numeric types:
|
|
|
|
|
\begin{itemize}
|
|
|
|
|
\item integers: that us either 31 or 63 bit two's complement arithmetic
|
|
|
|
|
depending on system word size, and also wrapping on overflow
|
|
|
|
|
\item 32 bit and 64 bit integers: that use 32-bit and 64-bit two's complement arithmetic
|
|
|
|
|
with wrap on overflow
|
|
|
|
|
\item big integers: offer integers with arbitrary precision
|
|
|
|
|
\item floats: that use IEEE754 double-precision (64-bit) arithmetic with
|
|
|
|
|
the addition of the literals \emph{infinity}, \emph{neg\_infinity} and \emph{nan}.
|
|
|
|
|
\end{itemize}
|
|
|
|
|
|
|
|
|
|
The are varios numeric operations defined:
|
|
|
|
|
|
|
|
|
|
\begin{itemize}
|
|
|
|
|
\item Arithmetic operations: +, -, *, /, \% (modulo), neg (unary negation)
|
|
|
|
|
\item Bitwise operations: \&, |, \^{}, <<, >> (zero-shifting), a>> (sign extending)
|
|
|
|
|
\item Numeric comparisons: <, >, <=, >=, ==
|
|
|
|
|
\end{itemize}
|
|
|
|
|
|
|
|
|
|
\item Functions
|
2020-03-03 17:18:40 +01:00
|
|
|
|
\label{sec:org369db83}
|
2020-03-02 14:46:37 +01:00
|
|
|
|
|
|
|
|
|
Functions are defined using the following syntax, and close over all
|
|
|
|
|
bindings in scope: (lambda (arg1 arg2 arg3) BODY)
|
|
|
|
|
and are applied using the following syntax: (apply FUNC ARG ARG ARG)
|
|
|
|
|
Evaluation is eager.
|
|
|
|
|
|
|
|
|
|
\item Bindings
|
2020-03-03 17:18:40 +01:00
|
|
|
|
\label{sec:org120bc74}
|
2020-03-02 14:46:37 +01:00
|
|
|
|
The atom \emph{let} introduces a sequence of bindings:
|
|
|
|
|
(let BINDING BINDING BINDING \ldots{} BODY)
|
|
|
|
|
|
|
|
|
|
\item Other atoms
|
2020-03-03 17:18:40 +01:00
|
|
|
|
\label{sec:org58bd28f}
|
2020-03-02 14:46:37 +01:00
|
|
|
|
TODO: if, switch, stringswitch\ldots{}
|
|
|
|
|
TODO: magari esempi
|
|
|
|
|
\end{enumerate}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
\subsection{Pattern matching}
|
2020-03-03 17:18:40 +01:00
|
|
|
|
\label{sec:org5d3b2f5}
|
2020-02-24 14:36:26 +01:00
|
|
|
|
|
|
|
|
|
Pattern matching is a widely adopted mechanism to interact with ADT.
|
|
|
|
|
C family languages provide branching on predicates through the use of
|
|
|
|
|
if statements and switch statements.
|
2020-02-24 19:46:00 +01:00
|
|
|
|
Pattern matching on the other hands express predicates through
|
|
|
|
|
syntactic templates that also allow to bind on data structures of
|
|
|
|
|
arbitrary shapes. One common example of pattern matching is the use of regular
|
2020-03-03 17:18:40 +01:00
|
|
|
|
expressions on strings. provides pattern matching on ADT and
|
2020-02-24 14:36:26 +01:00
|
|
|
|
primitive data types.
|
2020-02-24 19:46:00 +01:00
|
|
|
|
The result of a pattern matching operation is always one of:
|
2020-02-24 14:36:26 +01:00
|
|
|
|
\begin{itemize}
|
2020-02-24 19:46:00 +01:00
|
|
|
|
\item this value does not match this pattern”
|
|
|
|
|
\item this value matches this pattern, resulting the following bindings of
|
|
|
|
|
names to values and the jump to the expression pointed at the
|
|
|
|
|
pattern.
|
2020-02-24 14:36:26 +01:00
|
|
|
|
\end{itemize}
|
2020-02-24 19:46:00 +01:00
|
|
|
|
|
2020-02-24 14:36:26 +01:00
|
|
|
|
\begin{verbatim}
|
2020-02-24 19:46:00 +01:00
|
|
|
|
type color = | Red | Blue | Green | Black | White
|
2020-02-24 14:36:26 +01:00
|
|
|
|
|
2020-02-24 19:46:00 +01:00
|
|
|
|
match color with
|
2020-02-24 14:36:26 +01:00
|
|
|
|
| Red -> print "red"
|
|
|
|
|
| Blue -> print "red"
|
|
|
|
|
| Green -> print "red"
|
2020-02-24 19:46:00 +01:00
|
|
|
|
| _ -> print "white or black"
|
2020-02-24 14:36:26 +01:00
|
|
|
|
\end{verbatim}
|
|
|
|
|
|
2020-03-03 17:18:40 +01:00
|
|
|
|
provides tokens to express data destructoring.
|
2020-02-24 19:46:00 +01:00
|
|
|
|
For example we can examine the content of a list with patten matching
|
2020-02-24 14:36:26 +01:00
|
|
|
|
|
|
|
|
|
\begin{verbatim}
|
|
|
|
|
|
|
|
|
|
begin match list with
|
|
|
|
|
| [ ] -> print "empty list"
|
|
|
|
|
| element1 :: [ ] -> print "one element"
|
2020-02-24 19:46:00 +01:00
|
|
|
|
| (element1 :: element2) :: [ ] -> print "two elements"
|
2020-02-24 14:36:26 +01:00
|
|
|
|
| head :: tail-> print "head followed by many elements"
|
|
|
|
|
\end{verbatim}
|
|
|
|
|
|
2020-02-24 19:46:00 +01:00
|
|
|
|
Parenthesized patterns, such as the third one in the previous example,
|
|
|
|
|
matches the same value as the pattern without parenthesis.
|
|
|
|
|
|
|
|
|
|
The same could be done with tuples
|
2020-02-24 14:36:26 +01:00
|
|
|
|
\begin{verbatim}
|
|
|
|
|
|
|
|
|
|
begin match tuple with
|
|
|
|
|
| (Some _, Some _) -> print "Pair of optional types"
|
2020-02-24 19:46:00 +01:00
|
|
|
|
| (Some _, None) | (None, Some _) -> print "Pair of optional types, one of which is null"
|
2020-02-24 14:36:26 +01:00
|
|
|
|
| (None, None) -> print "Pair of optional types, both null"
|
|
|
|
|
\end{verbatim}
|
|
|
|
|
|
2020-02-24 19:46:00 +01:00
|
|
|
|
The pattern pattern₁ | pattern₂ represents the logical "or" of the
|
|
|
|
|
two patterns pattern₁ and pattern₂. A value matches pattern₁ |
|
|
|
|
|
pattern₂ if it matches pattern₁ or pattern₂.
|
|
|
|
|
|
2020-02-24 14:36:26 +01:00
|
|
|
|
Pattern clauses can make the use of \emph{guards} to test predicates and
|
2020-02-24 19:46:00 +01:00
|
|
|
|
variables can captured (binded in scope).
|
2020-02-24 14:36:26 +01:00
|
|
|
|
|
|
|
|
|
\begin{verbatim}
|
|
|
|
|
|
|
|
|
|
begin match token_list with
|
2020-02-24 19:46:00 +01:00
|
|
|
|
| "switch"::var::"{"::rest -> ...
|
|
|
|
|
| "case"::":"::var::rest when is_int var -> ...
|
|
|
|
|
| "case"::":"::var::rest when is_string var -> ...
|
|
|
|
|
| "}"::[ ] -> ...
|
2020-02-24 14:36:26 +01:00
|
|
|
|
| "}"::rest -> error "syntax error: " rest
|
|
|
|
|
|
|
|
|
|
\end{verbatim}
|
|
|
|
|
|
2020-03-03 17:18:40 +01:00
|
|
|
|
Moreover, the pattern matching compiler emits a warning when a
|
2020-02-24 19:46:00 +01:00
|
|
|
|
pattern is not exhaustive or some patterns are shadowed by precedent ones.
|
2020-02-24 14:36:26 +01:00
|
|
|
|
|
2020-03-03 17:18:40 +01:00
|
|
|
|
\subsection{Symbolic execution}
|
|
|
|
|
\label{sec:orge2e0205}
|
|
|
|
|
|
|
|
|
|
\subsection{Translation validation}
|
|
|
|
|
\label{sec:orgbafe7bc}
|
|
|
|
|
Translators, such as translators and code generators, are huge pieces of
|
|
|
|
|
software usually consisting of multiple subsystem and
|
|
|
|
|
constructing an actual specification of a translator implementation for
|
|
|
|
|
formal validation is a very long task. Moreover, different
|
|
|
|
|
translators implement different algorithms, so the correctness proof of
|
|
|
|
|
a translator cannot be generalized and reused to prove another translator.
|
|
|
|
|
Translation validation is an alternative to the verification of
|
|
|
|
|
existing translators that consists of taking the source and the target
|
|
|
|
|
(compiled) program and proving \emph{a posteriori} their semantic equivalence.
|
|
|
|
|
|
|
|
|
|
\begin{itemize}
|
|
|
|
|
\item[{$\square$}] Techniques for translation validation
|
|
|
|
|
\item[{$\square$}] What does semantically equivalent mean
|
|
|
|
|
\item[{$\square$}] What happens when there is no semantic equivalence
|
|
|
|
|
\item[{$\square$}] Translation validation through symbolic execution
|
|
|
|
|
\end{itemize}
|
|
|
|
|
|
|
|
|
|
\subsection{Translation validation of the Pattern Matching Compiler}
|
|
|
|
|
\label{sec:org24ee133}
|
|
|
|
|
|
|
|
|
|
\begin{enumerate}
|
|
|
|
|
\item Source program
|
|
|
|
|
\label{sec:org8c21912}
|
|
|
|
|
The algorithm takes as its input a source program and translates it
|
|
|
|
|
into an algebraic data structure called \emph{constraint\_tree}.
|
|
|
|
|
|
|
|
|
|
\begin{verbatim}
|
|
|
|
|
type constraint_tree =
|
|
|
|
|
| Unreachable
|
|
|
|
|
| Failure
|
|
|
|
|
| Leaf of source_expr
|
|
|
|
|
| Guard of source_blackbox * constraint_tree * constraint_tree
|
|
|
|
|
| Node of accessor * (constructor * constraint_tree) list * constraint_tree
|
|
|
|
|
\end{verbatim}
|
|
|
|
|
|
|
|
|
|
Unreachable, Leaf of source\_expr and Failure are the terminals of the three.
|
|
|
|
|
We distinguish
|
|
|
|
|
\begin{itemize}
|
|
|
|
|
\item Unreachable: statically it is known that no value can go there
|
|
|
|
|
\item Failure: a value matching this part results in an error
|
|
|
|
|
\item Leaf: a value matching this part results into the evaluation of a
|
|
|
|
|
source blackbox of code
|
|
|
|
|
\end{itemize}
|
|
|
|
|
|
|
|
|
|
The algorithm doesn't support type-declaration-based analysis
|
|
|
|
|
to know the list of constructors at a given type.
|
|
|
|
|
Let's consider some trivial examples:
|
|
|
|
|
|
|
|
|
|
\begin{verbatim}
|
|
|
|
|
function true -> 1
|
|
|
|
|
\end{verbatim}
|
|
|
|
|
|
|
|
|
|
[ ] Converti a disegni
|
|
|
|
|
|
|
|
|
|
Is translated to
|
|
|
|
|
\begin{center}
|
|
|
|
|
\begin{tabular}{l}
|
|
|
|
|
Node ([(true, Leaf 1)], Failure)\\
|
|
|
|
|
\end{tabular}
|
|
|
|
|
\end{center}
|
|
|
|
|
while
|
|
|
|
|
\begin{verbatim}
|
|
|
|
|
function
|
|
|
|
|
true -> 1
|
|
|
|
|
| false -> 2
|
|
|
|
|
\end{verbatim}
|
|
|
|
|
will give
|
|
|
|
|
\begin{center}
|
|
|
|
|
\begin{tabular}{l}
|
|
|
|
|
Node ([(true, Leaf 1); (false, Leaf 2)])\\
|
|
|
|
|
\end{tabular}
|
|
|
|
|
\end{center}
|
|
|
|
|
|
|
|
|
|
It is possible to produce Unreachable examples by using
|
|
|
|
|
refutation clauses (a "dot" in the right-hand-side)
|
|
|
|
|
\begin{verbatim}
|
|
|
|
|
function
|
|
|
|
|
true -> 1
|
|
|
|
|
| false -> 2
|
|
|
|
|
| _ -> .
|
|
|
|
|
\end{verbatim}
|
|
|
|
|
that gets translated into
|
|
|
|
|
Node ([(true, Leaf 1); (false, Leaf 2)], Unreachable)
|
|
|
|
|
|
|
|
|
|
We trust this annotation, which is reasonable as the type-checker
|
|
|
|
|
verifies that it indeed holds.
|
|
|
|
|
|
|
|
|
|
Guard nodes of the tree are emitted whenever a guard is found. Guards
|
|
|
|
|
node contains a blackbox of code that is never evaluated and two
|
|
|
|
|
branches, one that is taken in case the guard evaluates to true and
|
|
|
|
|
the other one that contains the path taken when the guard evaluates to
|
|
|
|
|
true.
|
|
|
|
|
|
|
|
|
|
[ ] Finisci con Node
|
|
|
|
|
[ ] Spiega del fallback
|
|
|
|
|
[ ] rivedi compilazione per tenere in considerazione il tuo albero invece che le lambda
|
|
|
|
|
[ ] Specifica che stesso algoritmo usato per compilare a lambda, piu` optimizations
|
|
|
|
|
|
|
|
|
|
The source code of a pattern matching function in has the
|
|
|
|
|
following form:
|
2020-02-24 14:36:26 +01:00
|
|
|
|
|
2020-02-24 19:46:00 +01:00
|
|
|
|
\begin{center}
|
|
|
|
|
\begin{tabular}{l}
|
|
|
|
|
match variable with\\
|
|
|
|
|
\(\vert{}\) pattern₁ -> expr₁\\
|
|
|
|
|
\(\vert{}\) pattern₂ when guard -> expr₂\\
|
|
|
|
|
\(\vert{}\) pattern₃ as var -> expr₃\\
|
|
|
|
|
⋮\\
|
|
|
|
|
\(\vert{}\) pₙ -> exprₙ\\
|
|
|
|
|
\end{tabular}
|
|
|
|
|
\end{center}
|
2020-02-24 14:36:26 +01:00
|
|
|
|
|
2020-03-03 17:18:40 +01:00
|
|
|
|
and can include any expression that is legal for the compiler,
|
|
|
|
|
such as "when" conditions and assignments. Patterns could or could not
|
|
|
|
|
be exhaustive.
|
|
|
|
|
|
|
|
|
|
Pattern matching code could also be written using the more compact form:
|
|
|
|
|
\begin{center}
|
|
|
|
|
\begin{tabular}{l}
|
|
|
|
|
function\\
|
|
|
|
|
\(\vert{}\) pattern₁ -> expr₁\\
|
|
|
|
|
\(\vert{}\) pattern₂ when guard -> expr₂\\
|
|
|
|
|
\(\vert{}\) pattern₃ as var -> expr₃\\
|
|
|
|
|
⋮\\
|
|
|
|
|
\(\vert{}\) pₙ -> exprₙ\\
|
|
|
|
|
\end{tabular}
|
|
|
|
|
\end{center}
|
2020-02-24 14:36:26 +01:00
|
|
|
|
|
|
|
|
|
|
2020-03-03 17:18:40 +01:00
|
|
|
|
This BNF grammar describes formally the grammar of the source program:
|
2020-02-24 14:36:26 +01:00
|
|
|
|
|
2020-03-03 17:18:40 +01:00
|
|
|
|
\begin{verbatim}
|
|
|
|
|
start ::= "match" id "with" patterns | "function" patterns
|
|
|
|
|
patterns ::= (pattern0|pattern1) pattern1+
|
|
|
|
|
;; pattern0 and pattern1 are needed to distinguish the first case in which
|
|
|
|
|
;; we can avoid writing the optional vertical line
|
|
|
|
|
pattern0 ::= clause
|
|
|
|
|
pattern1 ::= "|" clause
|
|
|
|
|
clause ::= lexpr "->" rexpr
|
|
|
|
|
|
|
|
|
|
lexpr ::= rule (ε|condition)
|
|
|
|
|
rexpr ::= _code ;; arbitrary code
|
|
|
|
|
|
|
|
|
|
rule ::= wildcard|variable|constructor_pattern|or_pattern ;;
|
|
|
|
|
|
|
|
|
|
;; rules
|
|
|
|
|
wildcard ::= "_"
|
|
|
|
|
variable ::= identifier
|
|
|
|
|
constructor_pattern ::= constructor (rule|ε) (assignment|ε)
|
|
|
|
|
|
|
|
|
|
constructor ::= int|float|char|string|bool
|
|
|
|
|
|unit|record|exn|objects|ref
|
|
|
|
|
|list|tuple|array
|
|
|
|
|
|variant|parameterized_variant ;; data types
|
|
|
|
|
|
|
|
|
|
or_pattern ::= wildcard|variable|constructor_pattern ("|" wildcard|variable|constructor_pattern)+
|
|
|
|
|
|
|
|
|
|
condition ::= "when" bexpr
|
|
|
|
|
assignment ::= "as" id
|
|
|
|
|
bexpr ::= _code ;; arbitrary code
|
2020-02-24 19:46:00 +01:00
|
|
|
|
\end{verbatim}
|
2020-02-24 14:36:26 +01:00
|
|
|
|
|
2020-03-02 14:46:37 +01:00
|
|
|
|
\begin{comment}
|
2020-03-03 17:18:40 +01:00
|
|
|
|
Check that it is still this
|
|
|
|
|
\end{comment}
|
2020-02-24 19:46:00 +01:00
|
|
|
|
|
2020-03-03 17:18:40 +01:00
|
|
|
|
Patterns are of the form
|
2020-02-24 14:36:26 +01:00
|
|
|
|
\begin{center}
|
|
|
|
|
\begin{tabular}{ll}
|
2020-02-24 19:46:00 +01:00
|
|
|
|
pattern & type of pattern\\
|
2020-02-24 14:36:26 +01:00
|
|
|
|
\hline
|
|
|
|
|
\_ & wildcard\\
|
|
|
|
|
x & variable\\
|
|
|
|
|
c(p₁,p₂,\ldots{},pₙ) & constructor pattern\\
|
|
|
|
|
(p₁\(\vert{}\) p₂) & or-pattern\\
|
|
|
|
|
\end{tabular}
|
|
|
|
|
\end{center}
|
|
|
|
|
|
2020-03-03 17:18:40 +01:00
|
|
|
|
During compilation by the translators expressions are compiled into
|
|
|
|
|
lambda code and are referred as lambda code actions lᵢ.
|
2020-02-24 14:36:26 +01:00
|
|
|
|
|
2020-03-03 17:18:40 +01:00
|
|
|
|
The entire pattern matching code is represented as a clause matrix
|
2020-02-24 14:36:26 +01:00
|
|
|
|
that associates rows of patterns (p\(_{\text{i,1}}\), p\(_{\text{i,2}}\), \ldots{}, p\(_{\text{i,n}}\)) to
|
|
|
|
|
lambda code action lⁱ
|
|
|
|
|
\begin{equation*}
|
|
|
|
|
(P → L) =
|
|
|
|
|
\begin{pmatrix}
|
|
|
|
|
p_{1,1} & p_{1,2} & \cdots & p_{1,n} & → l₁ \\
|
|
|
|
|
p_{2,1} & p_{2,2} & \cdots & p_{2,n} & → l₂ \\
|
|
|
|
|
\vdots & \vdots & \ddots & \vdots & → \vdots \\
|
|
|
|
|
p_{m,1} & p_{m,2} & \cdots & p_{m,n} & → lₘ
|
|
|
|
|
\end{pmatrix}
|
|
|
|
|
\end{equation*}
|
|
|
|
|
|
|
|
|
|
The pattern \emph{p} matches a value \emph{v}, written as p ≼ v, when one of the
|
|
|
|
|
following rules apply
|
|
|
|
|
|
|
|
|
|
\begin{center}
|
|
|
|
|
\begin{tabular}{llll}
|
|
|
|
|
\hline
|
|
|
|
|
\_ & ≼ & v & ∀v\\
|
|
|
|
|
x & ≼ & v & ∀v\\
|
|
|
|
|
(p₁ \(\vert{}\)$\backslash$ p₂) & ≼ & v & iff p₁ ≼ v or p₂ ≼ v\\
|
|
|
|
|
c(p₁, p₂, \ldots{}, pₐ) & ≼ & c(v₁, v₂, \ldots{}, vₐ) & iff (p₁, p₂, \ldots{}, pₐ) ≼ (v₁, v₂, \ldots{}, vₐ)\\
|
|
|
|
|
(p₁, p₂, \ldots{}, pₐ) & ≼ & (v₁, v₂, \ldots{}, vₐ) & iff pᵢ ≼ vᵢ ∀i ∈ [1..a]\\
|
|
|
|
|
\hline
|
|
|
|
|
\end{tabular}
|
|
|
|
|
\end{center}
|
|
|
|
|
|
2020-02-24 19:46:00 +01:00
|
|
|
|
When a value \emph{v} matches pattern \emph{p} we say that \emph{v} is an \emph{instance} of \emph{p}.
|
2020-02-24 14:36:26 +01:00
|
|
|
|
|
2020-02-24 19:46:00 +01:00
|
|
|
|
Considering the pattern matrix P we say that the value vector
|
2020-02-24 14:36:26 +01:00
|
|
|
|
\(\vec{v}\) = (v₁, v₂, \ldots{}, vᵢ) matches the line number i in P if and only if the following two
|
|
|
|
|
conditions are satisfied:
|
|
|
|
|
\begin{itemize}
|
|
|
|
|
\item p\(_{\text{i,1}}\), p\(_{\text{i,2}}\), \(\cdots{}\), p\(_{\text{i,n}}\) ≼ (v₁, v₂, \ldots{}, vᵢ)
|
|
|
|
|
\item ∀j < i p\(_{\text{j,1}}\), p\(_{\text{j,2}}\), \(\cdots{}\), p\(_{\text{j,n}}\) ⋠ (v₁, v₂, \ldots{}, vᵢ)
|
|
|
|
|
\end{itemize}
|
|
|
|
|
|
|
|
|
|
We can define the following three relations with respect to patterns:
|
|
|
|
|
\begin{itemize}
|
|
|
|
|
\item Patter p is less precise than pattern q, written p ≼ q, when all
|
|
|
|
|
instances of q are instances of p
|
|
|
|
|
\item Pattern p and q are equivalent, written p ≡ q, when their instances
|
|
|
|
|
are the same
|
|
|
|
|
\item Patterns p and q are compatible when they share a common instance
|
|
|
|
|
\end{itemize}
|
|
|
|
|
|
2020-02-24 19:46:00 +01:00
|
|
|
|
\begin{enumerate}
|
|
|
|
|
\item Initial state of the compilation
|
2020-03-03 17:18:40 +01:00
|
|
|
|
\label{sec:org9a7b624}
|
2020-02-24 14:36:26 +01:00
|
|
|
|
|
|
|
|
|
Given a source of the following form:
|
|
|
|
|
|
2020-02-24 19:46:00 +01:00
|
|
|
|
|
2020-02-24 14:36:26 +01:00
|
|
|
|
\begin{center}
|
|
|
|
|
\begin{tabular}{l}
|
2020-02-24 19:46:00 +01:00
|
|
|
|
match variable with\\
|
|
|
|
|
\(\vert{}\) pattern₁ -> e₁\\
|
|
|
|
|
\(\vert{}\) pattern₂ -> e₂\\
|
|
|
|
|
⋮\\
|
|
|
|
|
\(\vert{}\) pₘ -> eₘ\\
|
2020-02-24 14:36:26 +01:00
|
|
|
|
\end{tabular}
|
|
|
|
|
\end{center}
|
|
|
|
|
|
2020-02-24 19:46:00 +01:00
|
|
|
|
the initial input of the algorithm C consists of a vector of variables
|
|
|
|
|
\(\vec{x}\) = (x₁, x₂, \ldots{}, xₙ) of size \emph{n} where \emph{n} is the arity of
|
|
|
|
|
the type of \emph{x} and a clause matrix P → L of width n and height m.
|
|
|
|
|
That is:
|
2020-02-24 14:36:26 +01:00
|
|
|
|
|
|
|
|
|
\begin{equation*}
|
2020-02-24 19:46:00 +01:00
|
|
|
|
C((\vec{x} = (x₁, x₂, ..., xₙ),
|
2020-02-24 14:36:26 +01:00
|
|
|
|
\begin{pmatrix}
|
|
|
|
|
p_{1,1} & p_{1,2} & \cdots & p_{1,n} → l₁ \\
|
|
|
|
|
p_{2,1} & p_{2,2} & \cdots & p_{2,n} → l₂ \\
|
2020-02-24 19:46:00 +01:00
|
|
|
|
\vdots & \vdots & \ddots & \vdots → \vdots \\
|
|
|
|
|
p_{m,1} & p_{m,2} & \cdots & p_{m,n} → lₘ)
|
|
|
|
|
\end{pmatrix}
|
|
|
|
|
\end{equation*}
|
|
|
|
|
|
|
|
|
|
The base case C₀ of the algorithm is the case in which the \(\vec{x}\)
|
|
|
|
|
is empty, that is \(\vec{x}\) = (), then the result of the compilation
|
|
|
|
|
C₀ is l₁
|
|
|
|
|
\begin{equation*}
|
|
|
|
|
C₀((),
|
|
|
|
|
\begin{pmatrix}
|
|
|
|
|
→ l₁ \\
|
|
|
|
|
→ l₂ \\
|
|
|
|
|
→ \vdots \\
|
|
|
|
|
→ lₘ
|
|
|
|
|
\end{pmatrix})
|
|
|
|
|
) = l₁
|
|
|
|
|
\end{equation*}
|
|
|
|
|
|
|
|
|
|
When \(\vec{x}\) ≠ () then the compilation advances using one of the
|
|
|
|
|
following four rules:
|
|
|
|
|
|
|
|
|
|
\begin{enumerate}
|
|
|
|
|
\item Variable rule: if all patterns of the first column of P are wildcard patterns or
|
|
|
|
|
bind the value to a variable, then
|
|
|
|
|
|
|
|
|
|
\begin{equation*}
|
|
|
|
|
C(\vec{x}, P → L) = C((x₂, x₃, ..., xₙ), P' → L')
|
|
|
|
|
\end{equation*}
|
|
|
|
|
where
|
|
|
|
|
\begin{equation*}
|
|
|
|
|
\begin{pmatrix}
|
|
|
|
|
p_{1,2} & \cdots & p_{1,n} & → & (let & y₁ & x₁) & l₁ \\
|
|
|
|
|
p_{2,2} & \cdots & p_{2,n} & → & (let & y₂ & x₁) & l₂ \\
|
|
|
|
|
\vdots & \ddots & \vdots & → & \vdots & \vdots & \vdots & \vdots \\
|
|
|
|
|
p_{m,2} & \cdots & p_{m,n} & → & (let & yₘ & x₁) & lₘ
|
2020-02-24 14:36:26 +01:00
|
|
|
|
\end{pmatrix}
|
|
|
|
|
\end{equation*}
|
2020-02-24 19:46:00 +01:00
|
|
|
|
|
|
|
|
|
That means in every lambda action lᵢ there is a binding of x₁ to the
|
|
|
|
|
variable that appears on the pattern \$p\(_{\text{i,1}}\). Bindings are omitted
|
|
|
|
|
for wildcard patterns and the lambda action lᵢ remains unchanged.
|
|
|
|
|
|
|
|
|
|
\item Constructor rule: if all patterns in the first column of P are
|
|
|
|
|
constructors patterns of the form k(q₁, q₂, \ldots{}, qₙ) we define a
|
|
|
|
|
new matrix, the specialized clause matrix S, by applying the
|
|
|
|
|
following transformation on every row \emph{p}:
|
|
|
|
|
\begin{lstlisting}[mathescape,columns=fullflexible,basicstyle=\fontfamily{lmvtt}\selectfont,]
|
|
|
|
|
for every c ∈ Set of constructors
|
|
|
|
|
for i ← 1 .. m
|
|
|
|
|
let kᵢ ← constructor_of($p_{i,1}$)
|
|
|
|
|
if kᵢ = c then
|
|
|
|
|
p ← $q_{i,1}$, $q_{i,2}$, ..., $q_{i,n'}$, $p_{i,2}$, $p_{i,3}$, ..., $p_{i, n}$
|
|
|
|
|
\end{lstlisting}
|
|
|
|
|
Patterns of the form \(q_{i,j}\) matches on the values of the
|
|
|
|
|
constructor and we define new fresh variables y₁, y₂, \ldots{}, yₐ so
|
|
|
|
|
that the lambda action lᵢ becomes
|
|
|
|
|
\end{enumerate}
|
|
|
|
|
|
|
|
|
|
\begin{lstlisting}[mathescape,columns=fullflexible,basicstyle=\fontfamily{lmvtt}\selectfont,]
|
|
|
|
|
(let (y₁ (field 0 x₁))
|
|
|
|
|
(y₂ (field 1 x₁))
|
|
|
|
|
...
|
|
|
|
|
(yₐ (field (a-1) x₁))
|
|
|
|
|
lᵢ)
|
|
|
|
|
\end{lstlisting}
|
|
|
|
|
|
|
|
|
|
and the result of the compilation for the set of constructors
|
|
|
|
|
\{c₁, c₂, \ldots{}, cₖ\} is:
|
|
|
|
|
|
|
|
|
|
\begin{lstlisting}[mathescape,columns=fullflexible,basicstyle=\fontfamily{lmvtt}\selectfont,]
|
|
|
|
|
switch x₁ with
|
|
|
|
|
case c₁: l₁
|
|
|
|
|
case c₂: l₂
|
|
|
|
|
...
|
|
|
|
|
case cₖ: lₖ
|
|
|
|
|
default: exit
|
|
|
|
|
\end{lstlisting}
|
|
|
|
|
|
|
|
|
|
\begin{enumerate}
|
|
|
|
|
\item Orpat rule: there are various strategies for dealing with
|
|
|
|
|
or-patterns. The most naive one is to split the or-patterns.
|
|
|
|
|
For example a row p containing an or-pattern:
|
|
|
|
|
\begin{equation*}
|
|
|
|
|
(p_{i,1}|q_{i,1}|r_{i,1}), p_{i,2}, ..., p_{i,m} → lᵢ
|
|
|
|
|
\end{equation*}
|
|
|
|
|
results in three rows added to the clause matrix
|
|
|
|
|
\begin{equation*}
|
|
|
|
|
p_{i,1}, p_{i,2}, ..., p_{i,m} → lᵢ \\
|
|
|
|
|
\end{equation*}
|
|
|
|
|
\begin{equation*}
|
|
|
|
|
q_{i,1}, p_{i,2}, ..., p_{i,m} → lᵢ \\
|
|
|
|
|
\end{equation*}
|
|
|
|
|
\begin{equation*}
|
|
|
|
|
r_{i,1}, p_{i,2}, ..., p_{i,m} → lᵢ
|
|
|
|
|
\end{equation*}
|
|
|
|
|
\item Mixture rule:
|
|
|
|
|
When none of the previous rules apply the clause matrix P → L is
|
|
|
|
|
splitted into two clause matrices, the first P₁ → L₁ that is the
|
|
|
|
|
largest prefix matrix for which one of the three previous rules
|
|
|
|
|
apply, and P₂ → L₂ containing the remaining rows. The algorithm is
|
|
|
|
|
applied to both matrices.
|
|
|
|
|
\end{enumerate}
|
|
|
|
|
|
|
|
|
|
\begin{comment}
|
|
|
|
|
#+BEGIN_COMMENT
|
|
|
|
|
CITE paper?
|
|
|
|
|
#+END_COMMENT’
|
|
|
|
|
\end{comment}
|
|
|
|
|
\end{enumerate}
|
|
|
|
|
\end{enumerate}
|
2020-02-24 14:36:26 +01:00
|
|
|
|
\end{document}
|