python-peps/pep-0617/index.html


<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="utf-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <meta name="color-scheme" content="light dark">
    <title>PEP 617 – New PEG parser for CPython | peps.python.org</title>
    <link rel="shortcut icon" href="../_static/py.png">
    <link rel="canonical" href="https://peps.python.org/pep-0617/">
    <link rel="stylesheet" href="../_static/style.css" type="text/css">
    <link rel="stylesheet" href="../_static/mq.css" type="text/css">
    <link rel="stylesheet" href="../_static/pygments.css" type="text/css" media="(prefers-color-scheme: light)" id="pyg-light">
    <link rel="stylesheet" href="../_static/pygments_dark.css" type="text/css" media="(prefers-color-scheme: dark)" id="pyg-dark">
    <link rel="alternate" type="application/rss+xml" title="Latest PEPs" href="https://peps.python.org/peps.rss">
    <meta property="og:title" content='PEP 617 – New PEG parser for CPython | peps.python.org'>
    <meta property="og:description" content="Python Enhancement Proposals (PEPs)">
    <meta property="og:type" content="website">
    <meta property="og:url" content="https://peps.python.org/pep-0617/">
    <meta property="og:site_name" content="Python Enhancement Proposals (PEPs)">
    <meta property="og:image" content="https://peps.python.org/_static/og-image.png">
    <meta property="og:image:alt" content="Python PEPs">
    <meta property="og:image:width" content="200">
    <meta property="og:image:height" content="200">
    <meta name="description" content="Python Enhancement Proposals (PEPs)">
    <meta name="theme-color" content="#3776ab">
</head>
<body>

<svg xmlns="http://www.w3.org/2000/svg" style="display: none;">
  <symbol id="svg-sun-half" viewBox="0 0 24 24" pointer-events="all">
    <title>Following system colour scheme</title>
    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" fill="none"
         stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round">
      <circle cx="12" cy="12" r="9"></circle>
      <path d="M12 3v18m0-12l4.65-4.65M12 14.3l7.37-7.37M12 19.6l8.85-8.85"></path>
    </svg>
  </symbol>
  <symbol id="svg-moon" viewBox="0 0 24 24" pointer-events="all">
    <title>Selected dark colour scheme</title>
    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" fill="none"
         stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round">
      <path stroke="none" d="M0 0h24v24H0z" fill="none"></path>
      <path d="M12 3c.132 0 .263 0 .393 0a7.5 7.5 0 0 0 7.92 12.446a9 9 0 1 1 -8.313 -12.454z"></path>
    </svg>
  </symbol>
  <symbol id="svg-sun" viewBox="0 0 24 24" pointer-events="all">
    <title>Selected light colour scheme</title>
    <svg xmlns="http://www.w3.org/2000/svg" viewBox="0 0 24 24" fill="none"
         stroke="currentColor" stroke-width="2" stroke-linecap="round" stroke-linejoin="round">
      <circle cx="12" cy="12" r="5"></circle>
      <line x1="12" y1="1" x2="12" y2="3"></line>
      <line x1="12" y1="21" x2="12" y2="23"></line>
      <line x1="4.22" y1="4.22" x2="5.64" y2="5.64"></line>
      <line x1="18.36" y1="18.36" x2="19.78" y2="19.78"></line>
      <line x1="1" y1="12" x2="3" y2="12"></line>
      <line x1="21" y1="12" x2="23" y2="12"></line>
      <line x1="4.22" y1="19.78" x2="5.64" y2="18.36"></line>
      <line x1="18.36" y1="5.64" x2="19.78" y2="4.22"></line>
    </svg>
  </symbol>
</svg>
    <script>

        document.documentElement.dataset.colour_scheme = localStorage.getItem("colour_scheme") || "auto"
    </script>
    <section id="pep-page-section">
        <header>
            <h1>Python Enhancement Proposals</h1>
            <ul class="breadcrumbs">
                <li><a href="https://www.python.org/" title="The Python Programming Language">Python</a> &raquo; </li>
                <li><a href="../pep-0000/">PEP Index</a> &raquo; </li>
                <li>PEP 617</li>
            </ul>
            <button id="colour-scheme-cycler" onClick="setColourScheme(nextColourScheme())">
                <svg aria-hidden="true" class="colour-scheme-icon-when-auto"><use href="#svg-sun-half"></use></svg>
                <svg aria-hidden="true" class="colour-scheme-icon-when-dark"><use href="#svg-moon"></use></svg>
                <svg aria-hidden="true" class="colour-scheme-icon-when-light"><use href="#svg-sun"></use></svg>
                <span class="visually-hidden">Toggle light / dark / auto colour theme</span>
            </button>
        </header>
        <article>
            <section id="pep-content">
<h1 class="page-title">PEP 617 – New PEG parser for CPython</h1>
<dl class="rfc2822 field-list simple">
<dt class="field-odd">Author<span class="colon">:</span></dt>
<dd class="field-odd">Guido van Rossum &lt;guido&#32;&#97;t&#32;python.org&gt;,
Pablo Galindo &lt;pablogsal&#32;&#97;t&#32;python.org&gt;,
Lysandros Nikolaou &lt;lisandrosnik&#32;&#97;t&#32;gmail.com&gt;</dd>
<dt class="field-even">Discussions-To<span class="colon">:</span></dt>
<dd class="field-even"><a class="reference external" href="https://mail.python.org/archives/list/python-dev&#64;python.org/">Python-Dev list</a></dd>
<dt class="field-odd">Status<span class="colon">:</span></dt>
<dd class="field-odd"><abbr title="Accepted and implementation complete, or no longer active">Final</abbr></dd>
<dt class="field-even">Type<span class="colon">:</span></dt>
<dd class="field-even"><abbr title="Normative PEP with a new feature for Python, implementation change for CPython or interoperability standard for the ecosystem">Standards Track</abbr></dd>
<dt class="field-odd">Created<span class="colon">:</span></dt>
<dd class="field-odd">24-Mar-2020</dd>
<dt class="field-even">Python-Version<span class="colon">:</span></dt>
<dd class="field-even">3.9</dd>
<dt class="field-odd">Post-History<span class="colon">:</span></dt>
<dd class="field-odd">02-Apr-2020</dd>
</dl>
<hr class="docutils" />
<section id="contents">
<details><summary>Table of Contents</summary><ul class="simple">
<li><a class="reference internal" href="#overview">Overview</a></li>
<li><a class="reference internal" href="#background-on-ll-1-parsers">Background on LL(1) parsers</a></li>
<li><a class="reference internal" href="#background-on-peg-parsers">Background on PEG parsers</a></li>
<li><a class="reference internal" href="#rationale">Rationale</a><ul>
<li><a class="reference internal" href="#some-rules-are-not-actually-ll-1">Some rules are not actually LL(1)</a></li>
<li><a class="reference internal" href="#complicated-ast-parsing">Complicated AST parsing</a></li>
<li><a class="reference internal" href="#lack-of-left-recursion">Lack of left recursion</a></li>
<li><a class="reference internal" href="#intermediate-parse-tree">Intermediate parse tree</a></li>
</ul>
</li>
<li><a class="reference internal" href="#the-new-proposed-peg-parser">The new proposed PEG parser</a><ul>
<li><a class="reference internal" href="#left-recursion">Left recursion</a></li>
<li><a class="reference internal" href="#syntax">Syntax</a><ul>
<li><a class="reference internal" href="#grammar-expressions">Grammar Expressions</a><ul>
<li><a class="reference internal" href="#comment"><code class="docutils literal notranslate"><span class="pre">#</span> <span class="pre">comment</span></code></a></li>
<li><a class="reference internal" href="#e1-e2"><code class="docutils literal notranslate"><span class="pre">e1</span> <span class="pre">e2</span></code></a></li>
<li><a class="reference internal" href="#e1-e2-1"><code class="docutils literal notranslate"><span class="pre">e1</span> <span class="pre">|</span> <span class="pre">e2</span></code></a></li>
<li><a class="reference internal" href="#e"><code class="docutils literal notranslate"><span class="pre">(</span> <span class="pre">e</span> <span class="pre">)</span></code></a></li>
<li><a class="reference internal" href="#e-or-e"><code class="docutils literal notranslate"><span class="pre">[</span> <span class="pre">e</span> <span class="pre">]</span> <span class="pre">or</span> <span class="pre">e?</span></code></a></li>
<li><a class="reference internal" href="#e-1"><code class="docutils literal notranslate"><span class="pre">e*</span></code></a></li>
<li><a class="reference internal" href="#e-2"><code class="docutils literal notranslate"><span class="pre">e+</span></code></a></li>
<li><a class="reference internal" href="#s-e"><code class="docutils literal notranslate"><span class="pre">s.e+</span></code></a></li>
<li><a class="reference internal" href="#e-3"><code class="docutils literal notranslate"><span class="pre">&amp;e</span></code></a></li>
<li><a class="reference internal" href="#e-4"><code class="docutils literal notranslate"><span class="pre">!e</span></code></a></li>
<li><a class="reference internal" href="#e-5"><code class="docutils literal notranslate"><span class="pre">~</span></code></a></li>
</ul>
</li>
<li><a class="reference internal" href="#variables-in-the-grammar">Variables in the Grammar</a></li>
</ul>
</li>
<li><a class="reference internal" href="#grammar-actions">Grammar actions</a></li>
</ul>
</li>
<li><a class="reference internal" href="#migration-plan">Migration plan</a></li>
<li><a class="reference internal" href="#performance-and-validation">Performance and validation</a><ul>
<li><a class="reference internal" href="#validation">Validation</a></li>
<li><a class="reference internal" href="#performance">Performance</a></li>
</ul>
</li>
<li><a class="reference internal" href="#rejected-alternatives">Rejected Alternatives</a></li>
<li><a class="reference internal" href="#references">References</a></li>
<li><a class="reference internal" href="#copyright">Copyright</a></li>
</ul>
</details></section>
<div class="pep-banner canonical-doc sticky-banner admonition important">
<p class="admonition-title">Important</p>
<p>This PEP is a historical document. The up-to-date, canonical documentation can now be found at <a class="reference external" href="https://docs.python.org/3/reference/grammar.html#full-grammar-specification" title="(in Python v3.13)"><span>Full Grammar specification</span></a>.</p>
<p class="close-button">×</p>
<p>See <a class="pep reference internal" href="../pep-0001/" title="PEP 1 – PEP Purpose and Guidelines">PEP 1</a> for how to propose changes.</p>
</div>
<section id="overview">
<h2><a class="toc-backref" href="#overview" role="doc-backlink">Overview</a></h2>
<p>This PEP proposes replacing the current LL(1)-based parser of CPython
with a new PEG-based parser. This new parser would allow the elimination of multiple
“hacks” that exist in the current grammar to circumvent the LL(1)-limitation.
It would substantially reduce the maintenance costs in some areas related to the
compiling pipeline such as the grammar, the parser and the AST generation. The new PEG
parser will also lift the LL(1) restriction on the current Python grammar.</p>
</section>
<section id="background-on-ll-1-parsers">
<h2><a class="toc-backref" href="#background-on-ll-1-parsers" role="doc-backlink">Background on LL(1) parsers</a></h2>
<p>The current Python grammar is an LL(1)-based grammar. A grammar can be said to be
LL(1) if it can be parsed by an LL(1) parser, which in turn is defined as a
top-down parser that parses the input from left to right, performing leftmost
derivation of the sentence, with just one token of lookahead.
The traditional approach to constructing or generating an LL(1) parser is to
produce a <em>parse table</em> which encodes the possible transitions between all possible
states of the parser. These tables are normally constructed from the <em>first sets</em>
and the <em>follow sets</em> of the grammar:</p>
<ul>
<li>Given a rule, the <em>first set</em> is the collection of all terminals that can occur
first in a full derivation of that rule. Intuitively, this helps the parser decide
among the alternatives in a rule. For
instance, given the rule:<div class="highlight-PEG notranslate"><div class="highlight"><pre><span></span><span class="nc">rule</span><span class="o">:</span> <span class="nc">A</span> <span class="o">|</span> <span class="nc">B</span>
</pre></div>
</div>
<p>if only <code class="docutils literal notranslate"><span class="pre">A</span></code> can start with the terminal <em>a</em> and only <code class="docutils literal notranslate"><span class="pre">B</span></code> can start with the
terminal <em>b</em> and the parser sees the token <em>b</em> when parsing this rule, it knows
that it needs to follow the non-terminal <code class="docutils literal notranslate"><span class="pre">B</span></code>.</p>
</li>
<li>An extension to this simple idea is needed when a rule may expand to the empty string.
Given a rule, the <em>follow set</em> is the collection of terminals that can appear
immediately to the right of that rule in a partial derivation. Intuitively, this
solves the problem of the empty alternative. For instance,
given this rule:<div class="highlight-PEG notranslate"><div class="highlight"><pre><span></span><span class="nc">rule</span><span class="o">:</span> <span class="nc">A</span> <span class="s1">&#39;b&#39;</span>
</pre></div>
</div>
<p>if the parser has the token <em>b</em> and the non-terminal <code class="docutils literal notranslate"><span class="pre">A</span></code> can only start
with the token <em>a</em>, then the parser can tell that this is an invalid program.
But if <code class="docutils literal notranslate"><span class="pre">A</span></code> could expand to the empty string (called an ε-production),
then the parser would recognise a valid empty <code class="docutils literal notranslate"><span class="pre">A</span></code>,
since the next token <em>b</em> is in the <em>follow set</em>  of <code class="docutils literal notranslate"><span class="pre">A</span></code>.</p>
</li>
</ul>
<p>The current Python grammar does not contain ε-productions, so the <em>follow sets</em> are not
needed when creating the parse tables. Currently, in CPython, a parser generator
program reads the grammar and produces a parsing table representing a set of
deterministic finite automata (DFA) that can be included in a C program, the
parser. The parser is a pushdown automaton that uses this data to produce a Concrete
Syntax Tree (CST) sometimes known directly as a “parse tree”. In this process, the
<em>first sets</em> are used indirectly when generating the DFAs.</p>
<p>LL(1) parsers and grammars are usually efficient and simple to implement
and generate. However, it is not possible, under the LL(1) restriction,
to express certain common constructs in a way natural to the language
designer and the reader. This includes some in the Python language.</p>
<p>As LL(1) parsers can only look one token ahead to distinguish
possibilities, some rules in the grammar may be ambiguous. For instance the rule:</p>
<div class="highlight-PEG notranslate"><div class="highlight"><pre><span></span><span class="nc">rule</span><span class="o">:</span> <span class="nc">A</span> <span class="o">|</span> <span class="nc">B</span>
</pre></div>
</div>
<p>is ambiguous if the <em>first sets</em> of both <code class="docutils literal notranslate"><span class="pre">A</span></code> and <code class="docutils literal notranslate"><span class="pre">B</span></code> have some elements in
common. When the parser sees a token in the input
program that both <em>A</em> and <em>B</em> can start with, it is impossible for it to deduce
which option to expand, as no further token of the program can be examined to
disambiguate.
The rule may be transformed to equivalent LL(1) rules, but then it may
be harder for a human reader to grasp its meaning.
Examples later in this document show that the current LL(1)-based
grammar suffers a lot from this scenario.</p>
<p>Another broad class of rules precluded by LL(1) is left-recursive rules.
A rule is left-recursive if it can derive to a
sentential form with itself as the leftmost symbol. For instance this rule:</p>
<div class="highlight-PEG notranslate"><div class="highlight"><pre><span></span><span class="nc">rule</span><span class="o">:</span> <span class="nc">rule</span> <span class="s1">&#39;a&#39;</span>
</pre></div>
</div>
<p>is left-recursive because the rule can be expanded to an expression that starts
with itself. As will be described later, left-recursion is the natural way to
express certain desired language properties directly in the grammar.</p>
</section>
<section id="background-on-peg-parsers">
<h2><a class="toc-backref" href="#background-on-peg-parsers" role="doc-backlink">Background on PEG parsers</a></h2>
<p>A PEG (Parsing Expression Grammar) grammar differs from a context-free grammar
(like the current one) in the fact that the way it is written more closely
reflects how the parser will operate when parsing it. The fundamental technical
difference is that the choice operator is ordered. This means that when writing:</p>
<div class="highlight-PEG notranslate"><div class="highlight"><pre><span></span><span class="nc">rule</span><span class="o">:</span> <span class="nc">A</span> <span class="o">|</span> <span class="nc">B</span> <span class="o">|</span> <span class="nc">C</span>
</pre></div>
</div>
<p>a context-free-grammar parser (like an LL(1) parser) will generate constructions
that given an input string will <em>deduce</em> which alternative (<code class="docutils literal notranslate"><span class="pre">A</span></code>, <code class="docutils literal notranslate"><span class="pre">B</span></code> or <code class="docutils literal notranslate"><span class="pre">C</span></code>)
must be expanded, while a PEG parser will check if the first alternative succeeds
and only if it fails, will it continue with the second or the third one in the
order in which they are written. This makes the choice operator not commutative.</p>
<p>Unlike LL(1) parsers, PEG-based parsers cannot be ambiguous: if a string parses,
it has exactly one valid parse tree. This means that a PEG-based parser cannot
suffer from the ambiguity problems described in the previous section.</p>
<p>PEG parsers are usually constructed as a recursive descent parser in which every
rule in the grammar corresponds to a function in the program implementing the
parser and the parsing expression (the “expansion” or “definition” of the rule)
represents the “code” in said function. Each parsing function conceptually takes
an input string as its argument, and yields one of the following results:</p>
<ul class="simple">
<li>A “success” result. This result indicates that the expression can be parsed by
that rule and the function may optionally move forward or consume one or more
characters of the input string supplied to it.</li>
<li>A “failure” result, in which case no input is consumed.</li>
</ul>
<p>Notice that “failure” results do not imply that the program is incorrect or a
parsing failure because as the choice operator is ordered, a “failure” result
merely indicates “try the following option”. A direct implementation of a PEG
parser as a recursive descent parser will present exponential time performance in
the worst case as compared with LL(1) parsers, because PEG parsers have infinite lookahead
(this means that they can consider an arbitrary number of tokens before deciding
for a rule). Usually, PEG parsers avoid this exponential time complexity with a
technique called “packrat parsing” <a class="footnote-reference brackets" href="#id10" id="id1">[1]</a> which not only loads the entire
program in memory before parsing it but also allows the parser to backtrack
arbitrarily. This is made efficient by memoizing the rules already matched for
each position. The cost of the memoization cache is that the parser will naturally
use more memory than a simple LL(1) parser, which normally are table-based. We
will explain later in this document why we consider this cost acceptable.</p>
</section>
<section id="rationale">
<h2><a class="toc-backref" href="#rationale" role="doc-backlink">Rationale</a></h2>
<p>In this section, we describe a list of problems that are present in the current parser
machinery in CPython that motivates the need for a new parser.</p>
<section id="some-rules-are-not-actually-ll-1">
<h3><a class="toc-backref" href="#some-rules-are-not-actually-ll-1" role="doc-backlink">Some rules are not actually LL(1)</a></h3>
<p>Although the Python grammar is technically an LL(1) grammar (because it is parsed by
an LL(1) parser) several rules are not LL(1) and several workarounds are
implemented in the grammar and in other parts of CPython to deal with this. For
example, consider the rule for assignment expressions:</p>
<div class="highlight-PEG notranslate"><div class="highlight"><pre><span></span><span class="nc">namedexpr_test</span><span class="o">:</span> <span class="p">[</span><span class="s">NAME &#39;:=&#39;</span><span class="p">]</span> <span class="nc">test</span>
</pre></div>
</div>
<p>This simple rule is not compatible with the Python grammar as <em>NAME</em> is among the
elements of the <em>first set</em> of the rule <em>test</em>. To work around this limitation the
actual rule that appears in the current grammar is:</p>
<div class="highlight-PEG notranslate"><div class="highlight"><pre><span></span><span class="nc">namedexpr_test</span><span class="o">:</span> <span class="nc">test</span> <span class="p">[</span><span class="s">&#39;:=&#39; test</span><span class="p">]</span>
</pre></div>
</div>
<p>Which is a much broader rule than the previous one allowing constructs like <code class="docutils literal notranslate"><span class="pre">[x</span>
<span class="pre">for</span> <span class="pre">x</span> <span class="pre">in</span> <span class="pre">y]</span> <span class="pre">:=</span> <span class="pre">[1,2,3]</span></code>. The way the rule is limited to its desired form is by
disallowing these unwanted constructions when transforming the parse tree to the
abstract syntax tree. This is not only inelegant but a considerable maintenance
burden as it forces the AST creation routines and the compiler into a situation in
which they need to know how to separate valid programs from invalid programs,
which should be a responsibility solely of the parser. This also leads to the
actual grammar file not reflecting correctly what the <em>actual</em> grammar is (that
is, the collection of all valid Python programs).</p>
<p>Similar workarounds appear in multiple other rules of the current grammar.
Sometimes this problem is unsolvable. For instance, <a class="reference external" href="https://github.com/python/cpython/issues/56991">bpo-12782: Multiple context
expressions do not support parentheses for continuation across lines</a> shows how making an LL(1) rule that supports
writing:</p>
<div class="highlight-PEG notranslate"><div class="highlight"><pre><span></span><span class="nc">with</span> <span class="p">(</span>
    <span class="nc">open</span><span class="p">(</span><span class="s2">&quot;a_really_long_foo&quot;</span><span class="p">)</span> <span class="nc">as</span> <span class="nc">foo,</span>
    <span class="nc">open</span><span class="p">(</span><span class="s2">&quot;a_really_long_baz&quot;</span><span class="p">)</span> <span class="nc">as</span> <span class="nc">baz,</span>
    <span class="nc">open</span><span class="p">(</span><span class="s2">&quot;a_really_long_bar&quot;</span><span class="p">)</span> <span class="nc">as</span> <span class="nc">bar</span>
<span class="p">)</span><span class="o">:</span>
  <span class="k">...</span>
</pre></div>
</div>
<p>is not possible since the first sets of the grammar items that can
appear as context managers include the open parenthesis, making the rule
ambiguous. This rule is not only consistent with other parts of the language (like
the rule for multiple imports), but is also very useful to auto-formatting tools,
as parenthesized groups are normally used to group elements to be
formatted together (in the same way the tools operate on the contents of lists,
sets…).</p>
</section>
<section id="complicated-ast-parsing">
<h3><a class="toc-backref" href="#complicated-ast-parsing" role="doc-backlink">Complicated AST parsing</a></h3>
<p>Another problem of the current parser is that there is a huge coupling between the
AST generation routines and the particular shape of the produced parse trees. This
makes the code for generating the AST especially complicated as many actions and
choices are implicit. For instance, the AST generation code knows what
alternatives of a certain rule are produced based on the number of child nodes
present in a given parse node. This makes the code difficult to follow as this
property is not directly related to the grammar file and is influenced by
implementation details. As a result of this, a considerable amount of the AST
generation code needs to deal with inspecting and reasoning about the particular
shape of the parse trees that it receives.</p>
</section>
<section id="lack-of-left-recursion">
<h3><a class="toc-backref" href="#lack-of-left-recursion" role="doc-backlink">Lack of left recursion</a></h3>
<p>As described previously, a limitation of LL(1) grammars is that they cannot allow
left-recursion. This makes writing some rules very unnatural and far from how
programmers normally think about the program. For instance this construct (a simpler
variation of several rules present in the current grammar):</p>
<div class="highlight-PEG notranslate"><div class="highlight"><pre><span></span><span class="nc">expr</span><span class="o">:</span> <span class="nc">expr</span> <span class="s1">&#39;+&#39;</span> <span class="nc">term</span> <span class="o">|</span> <span class="nc">term</span>
</pre></div>
</div>
<p>cannot be parsed by an LL(1) parser. The traditional remedy is to rewrite the
grammar to circumvent the problem:</p>
<div class="highlight-PEG notranslate"><div class="highlight"><pre><span></span><span class="nc">expr</span><span class="o">:</span> <span class="nc">term</span> <span class="p">(</span><span class="s1">&#39;+&#39;</span> <span class="nc">term</span><span class="p">)</span><span class="o">*</span>
</pre></div>
</div>
<p>The problem that appears with this form is that the parse tree is forced to have a
very unnatural shape. This is because with this rule, for the input program <code class="docutils literal notranslate"><span class="pre">a</span> <span class="pre">+</span>
<span class="pre">b</span> <span class="pre">+</span> <span class="pre">c</span></code> the parse tree will be flattened (<code class="docutils literal notranslate"><span class="pre">['a',</span> <span class="pre">'+',</span> <span class="pre">'b',</span> <span class="pre">'+',</span> <span class="pre">'c']</span></code>) and must
be post-processed to construct a left-recursive parse tree (<code class="docutils literal notranslate"><span class="pre">[['a',</span> <span class="pre">'+',</span> <span class="pre">'b'],</span>
<span class="pre">'+',</span> <span class="pre">'c']</span></code>). Being forced to write the second rule not only leads to the parse
tree not correctly reflecting the desired associativity, but also imposes further
pressure on later compilation stages to detect and post-process these cases.</p>
</section>
<section id="intermediate-parse-tree">
<h3><a class="toc-backref" href="#intermediate-parse-tree" role="doc-backlink">Intermediate parse tree</a></h3>
<p>The last problem present in the current parser is the intermediate creation of a
parse tree or Concrete Syntax Tree that is later transformed to an Abstract Syntax
Tree. Although the construction of a CST is very common in parser and compiler
pipelines, in CPython this intermediate CST is not used by anything else (it is
only indirectly exposed by the <em>parser</em> module and a surprisingly small part of
the code in the CST production is reused in the module). Which is worse: the whole
tree is kept in memory, keeping many branches that consist of chains of nodes with
a single child. This has been shown to consume a considerable amount of memory (for
instance in <a class="reference external" href="https://github.com/python/cpython/issues/70603">bpo-26415: Excessive peak memory consumption by the Python
parser</a>).</p>
<p>Having to produce an intermediate result between the grammar and the AST is not only
undesirable but also makes the AST generation step much more complicated, raising
considerably the maintenance burden.</p>
</section>
</section>
<section id="the-new-proposed-peg-parser">
<h2><a class="toc-backref" href="#the-new-proposed-peg-parser" role="doc-backlink">The new proposed PEG parser</a></h2>
<p>The new proposed PEG parser contains the following pieces:</p>
<ul class="simple">
<li>A parser generator that can read a grammar file and produce a PEG parser
written in Python or C that can parse said grammar.</li>
<li>A PEG meta-grammar that automatically generates a Python parser that is used
for the parser generator itself (this means that there are no manually-written
parsers).</li>
<li>A generated parser (using the parser generator) that can directly produce C and
Python AST objects.</li>
</ul>
<section id="left-recursion">
<h3><a class="toc-backref" href="#left-recursion" role="doc-backlink">Left recursion</a></h3>
<p>PEG parsers normally do not support left recursion but we have implemented a
technique similar to the one described in Medeiros et al. <a class="footnote-reference brackets" href="#id11" id="id2">[2]</a> but using the
memoization cache instead of static variables. This approach is closer to the one
described in Warth et al. <a class="footnote-reference brackets" href="#id12" id="id3">[3]</a>. This allows us to write not only simple left-recursive
rules but also more complicated rules that involve indirect left-recursion like:</p>
<div class="highlight-PEG notranslate"><div class="highlight"><pre><span></span><span class="nc">rule1</span><span class="o">:</span> <span class="nc">rule2</span> <span class="o">|</span> <span class="s1">&#39;a&#39;</span>
<span class="nc">rule2</span><span class="o">:</span> <span class="nc">rule3</span> <span class="o">|</span> <span class="s1">&#39;b&#39;</span>
<span class="nc">rule3</span><span class="o">:</span> <span class="nc">rule1</span> <span class="o">|</span> <span class="s1">&#39;c&#39;</span>
</pre></div>
</div>
<p>and “hidden left-recursion” like:</p>
<div class="highlight-PEG notranslate"><div class="highlight"><pre><span></span><span class="nc">rule</span><span class="o">:</span> <span class="s1">&#39;optional&#39;</span><span class="o">?</span> <span class="nc">rule</span> <span class="s1">&#39;@&#39;</span> <span class="nc">some_other_rule</span>
</pre></div>
</div>
</section>
<section id="syntax">
<h3><a class="toc-backref" href="#syntax" role="doc-backlink">Syntax</a></h3>
<p>The grammar consists of a sequence of rules of the form:</p>
<div class="highlight-PEG notranslate"><div class="highlight"><pre><span></span><span class="nc">rule_name</span><span class="o">:</span> <span class="nc">expression</span>
</pre></div>
</div>
<p>Optionally, a type can be included right after the rule name, which
specifies the return type of the C or Python function corresponding to
the rule:</p>
<div class="highlight-PEG notranslate"><div class="highlight"><pre><span></span><span class="nc">rule_name</span><span class="p">[</span><span class="s">return_type</span><span class="p">]</span><span class="o">:</span> <span class="nc">expression</span>
</pre></div>
</div>
<p>If the return type is omitted, then a <code class="docutils literal notranslate"><span class="pre">void</span> <span class="pre">*</span></code> is returned in C and an
<code class="docutils literal notranslate"><span class="pre">Any</span></code> in Python.</p>
<section id="grammar-expressions">
<h4><a class="toc-backref" href="#grammar-expressions" role="doc-backlink">Grammar Expressions</a></h4>
<section id="comment">
<h5><a class="toc-backref" href="#comment" role="doc-backlink"><code class="docutils literal notranslate"><span class="pre">#</span> <span class="pre">comment</span></code></a></h5>
<p>Python-style comments.</p>
</section>
<section id="e1-e2">
<h5><a class="toc-backref" href="#e1-e2" role="doc-backlink"><code class="docutils literal notranslate"><span class="pre">e1</span> <span class="pre">e2</span></code></a></h5>
<p>Match e1, then match e2.</p>
<div class="highlight-PEG notranslate"><div class="highlight"><pre><span></span><span class="nc">rule_name</span><span class="o">:</span> <span class="nc">first_rule</span> <span class="nc">second_rule</span>
</pre></div>
</div>
</section>
<section id="e1-e2-1">
<span id="id4"></span><h5><a class="toc-backref" href="#e1-e2-1" role="doc-backlink"><code class="docutils literal notranslate"><span class="pre">e1</span> <span class="pre">|</span> <span class="pre">e2</span></code></a></h5>
<p>Match e1 or e2.</p>
<p>The first alternative can also appear on the line after the rule name
for formatting purposes. In that case, a | must be used before the
first alternative, like so:</p>
<div class="highlight-PEG notranslate"><div class="highlight"><pre><span></span><span class="nc">rule_name</span><span class="p">[</span><span class="s">return_type</span><span class="p">]</span><span class="o">:</span>
    <span class="o">|</span> <span class="nc">first_alt</span>
    <span class="o">|</span> <span class="nc">second_alt</span>
</pre></div>
</div>
</section>
<section id="e">
<h5><a class="toc-backref" href="#e" role="doc-backlink"><code class="docutils literal notranslate"><span class="pre">(</span> <span class="pre">e</span> <span class="pre">)</span></code></a></h5>
<p>Match e.</p>
<div class="highlight-PEG notranslate"><div class="highlight"><pre><span></span><span class="nc">rule_name</span><span class="o">:</span> <span class="p">(</span><span class="nc">e</span><span class="p">)</span>
</pre></div>
</div>
<p>A slightly more complex and useful example includes using the grouping
operator together with the repeat operators:</p>
<div class="highlight-PEG notranslate"><div class="highlight"><pre><span></span><span class="nc">rule_name</span><span class="o">:</span> <span class="p">(</span><span class="nc">e1</span> <span class="nc">e2</span><span class="p">)</span><span class="o">*</span>
</pre></div>
</div>
</section>
<section id="e-or-e">
<h5><a class="toc-backref" href="#e-or-e" role="doc-backlink"><code class="docutils literal notranslate"><span class="pre">[</span> <span class="pre">e</span> <span class="pre">]</span> <span class="pre">or</span> <span class="pre">e?</span></code></a></h5>
<p>Optionally match e.</p>
<div class="highlight-PEG notranslate"><div class="highlight"><pre><span></span><span class="nc">rule_name</span><span class="o">:</span> <span class="p">[</span><span class="s">e</span><span class="p">]</span>
</pre></div>
</div>
<p>A more useful example includes defining that a trailing comma is
optional:</p>
<div class="highlight-PEG notranslate"><div class="highlight"><pre><span></span><span class="nc">rule_name</span><span class="o">:</span> <span class="nc">e</span> <span class="p">(</span><span class="s1">&#39;,&#39;</span> <span class="nc">e</span><span class="p">)</span><span class="o">*</span> <span class="p">[</span><span class="s">&#39;,&#39;</span><span class="p">]</span>
</pre></div>
</div>
</section>
<section id="e-1">
<span id="id5"></span><h5><a class="toc-backref" href="#e-1" role="doc-backlink"><code class="docutils literal notranslate"><span class="pre">e*</span></code></a></h5>
<p>Match zero or more occurrences of e.</p>
<div class="highlight-PEG notranslate"><div class="highlight"><pre><span></span><span class="nc">rule_name</span><span class="o">:</span> <span class="p">(</span><span class="nc">e1</span> <span class="nc">e2</span><span class="p">)</span><span class="o">*</span>
</pre></div>
</div>
</section>
<section id="e-2">
<span id="id6"></span><h5><a class="toc-backref" href="#e-2" role="doc-backlink"><code class="docutils literal notranslate"><span class="pre">e+</span></code></a></h5>
<p>Match one or more occurrences of e.</p>
<div class="highlight-PEG notranslate"><div class="highlight"><pre><span></span><span class="nc">rule_name</span><span class="o">:</span> <span class="p">(</span><span class="nc">e1</span> <span class="nc">e2</span><span class="p">)</span><span class="o">+</span>
</pre></div>
</div>
</section>
<section id="s-e">
<h5><a class="toc-backref" href="#s-e" role="doc-backlink"><code class="docutils literal notranslate"><span class="pre">s.e+</span></code></a></h5>
<p>Match one or more occurrences of e, separated by s. The generated parse
tree does not include the separator. This is otherwise identical to
<code class="docutils literal notranslate"><span class="pre">(e</span> <span class="pre">(s</span> <span class="pre">e)*)</span></code>.</p>
<div class="highlight-PEG notranslate"><div class="highlight"><pre><span></span><span class="nc">rule_name</span><span class="o">:</span> <span class="s1">&#39;,&#39;</span><span class="k">.</span><span class="nc">e</span><span class="o">+</span>
</pre></div>
</div>
</section>
<section id="e-3">
<span id="id7"></span><h5><a class="toc-backref" href="#e-3" role="doc-backlink"><code class="docutils literal notranslate"><span class="pre">&amp;e</span></code></a></h5>
<p>Succeed if e can be parsed, without consuming any input.</p>
</section>
<section id="e-4">
<span id="id8"></span><h5><a class="toc-backref" href="#e-4" role="doc-backlink"><code class="docutils literal notranslate"><span class="pre">!e</span></code></a></h5>
<p>Fail if e can be parsed, without consuming any input.</p>
<p>An example taken from the proposed Python grammar specifies that a primary
consists of an atom, which is not followed by a <code class="docutils literal notranslate"><span class="pre">.</span></code> or a <code class="docutils literal notranslate"><span class="pre">(</span></code> or a
<code class="docutils literal notranslate"><span class="pre">[</span></code>:</p>
<div class="highlight-PEG notranslate"><div class="highlight"><pre><span></span><span class="nc">primary</span><span class="o">:</span> <span class="nc">atom</span> <span class="o">!</span><span class="s1">&#39;.&#39;</span> <span class="o">!</span><span class="s1">&#39;(&#39;</span> <span class="o">!</span><span class="s1">&#39;[&#39;</span>
</pre></div>
</div>
</section>
<section id="e-5">
<span id="id9"></span><h5><a class="toc-backref" href="#e-5" role="doc-backlink"><code class="docutils literal notranslate"><span class="pre">~</span></code></a></h5>
<p>Commit to the current alternative, even if it fails to parse.</p>
<div class="highlight-PEG notranslate"><div class="highlight"><pre><span></span><span class="nc">rule_name</span><span class="o">:</span> <span class="s1">&#39;(&#39;</span> <span class="o">~</span> <span class="nc">some_rule</span> <span class="s1">&#39;)&#39;</span> <span class="o">|</span> <span class="nc">some_alt</span>
</pre></div>
</div>
<p>In this example, if a left parenthesis is parsed, then the other
alternative won’t be considered, even if some_rule or ‘)’ fail to be
parsed.</p>
</section>
</section>
<section id="variables-in-the-grammar">
<h4><a class="toc-backref" href="#variables-in-the-grammar" role="doc-backlink">Variables in the Grammar</a></h4>
<p>A subexpression can be named by preceding it with an identifier and an
<code class="docutils literal notranslate"><span class="pre">=</span></code> sign. The name can then be used in the action (see below), like this:</p>
<div class="highlight-PEG notranslate"><div class="highlight"><pre><span></span><span class="nc">rule_name</span><span class="p">[</span><span class="s">return_type</span><span class="p">]</span><span class="o">:</span> <span class="s1">&#39;(&#39;</span> <span class="nc">a</span><span class="o">=</span><span class="nc">some_other_rule</span> <span class="s1">&#39;)&#39;</span> <span class="nc">{</span> <span class="nc">a</span> <span class="nc">}</span>
</pre></div>
</div>
</section>
</section>
<section id="grammar-actions">
<h3><a class="toc-backref" href="#grammar-actions" role="doc-backlink">Grammar actions</a></h3>
<p>To avoid the intermediate steps that obscure the relationship between the
grammar and the AST generation the proposed PEG parser allows directly
generating AST nodes for a rule via grammar actions. Grammar actions are
language-specific expressions that are evaluated when a grammar rule is
successfully parsed. These expressions can be written in Python or C
depending on the desired output of the parser generator. This means that if
one would want to generate a parser in Python and another in C, two grammar
files should be written, each one with a different set of actions, keeping
everything else apart from said actions identical in both files. As an
example of a grammar with Python actions, the piece of the parser generator
that parses grammar files is bootstrapped from a meta-grammar file with
Python actions that generate the grammar tree as a result of the parsing.</p>
<p>In the specific case of the new proposed PEG grammar for Python, having
actions allows directly describing how the AST is composed in the grammar
itself, making it more clear and maintainable. This AST generation process is
supported by the use of some helper functions that factor out common AST
object manipulations and some other required operations that are not directly
related to the grammar.</p>
<p>To indicate these actions each alternative can be followed by the action code
inside curly-braces, which specifies the return value of the alternative:</p>
<div class="highlight-PEG notranslate"><div class="highlight"><pre><span></span><span class="nc">rule_name</span><span class="p">[</span><span class="s">return_type</span><span class="p">]</span><span class="o">:</span>
    <span class="o">|</span> <span class="nc">first_alt1</span> <span class="nc">first_alt2</span> <span class="nc">{</span> <span class="nc">first_alt1</span> <span class="nc">}</span>
    <span class="o">|</span> <span class="nc">second_alt1</span> <span class="nc">second_alt2</span> <span class="nc">{</span> <span class="nc">second_alt1</span> <span class="nc">}</span>
</pre></div>
</div>
<p>If the action is omitted and C code is being generated, then there are two
different possibilities:</p>
<ol class="arabic simple">
<li>If there’s a single name in the alternative, this gets returned.</li>
<li>If not, a dummy name object gets returned (this case should be avoided).</li>
</ol>
<p>If the action is omitted and Python code is being generated, then a list
with all the parsed expressions gets returned (this is meant for debugging).</p>
<p>The full meta-grammar for the grammars supported by the PEG generator is:</p>
<div class="highlight-PEG notranslate"><div class="highlight"><pre><span></span><span class="nc">start</span><span class="p">[</span><span class="s">Grammar</span><span class="p">]</span><span class="o">:</span> <span class="nc">grammar</span> <span class="nc">ENDMARKER</span> <span class="nc">{</span> <span class="nc">grammar</span> <span class="nc">}</span>

<span class="nc">grammar</span><span class="p">[</span><span class="s">Grammar</span><span class="p">]</span><span class="o">:</span>
    <span class="o">|</span> <span class="nc">metas</span> <span class="nc">rules</span> <span class="nc">{</span> <span class="nc">Grammar</span><span class="p">(</span><span class="nc">rules,</span> <span class="nc">metas</span><span class="p">)</span> <span class="nc">}</span>
    <span class="o">|</span> <span class="nc">rules</span> <span class="nc">{</span> <span class="nc">Grammar</span><span class="p">(</span><span class="nc">rules,</span> <span class="p">[])</span> <span class="nc">}</span>

<span class="nc">metas</span><span class="p">[</span><span class="s">MetaList</span><span class="p">]</span><span class="o">:</span>
    <span class="o">|</span> <span class="nc">meta</span> <span class="nc">metas</span> <span class="nc">{</span> <span class="p">[</span><span class="s">meta</span><span class="p">]</span> <span class="o">+</span> <span class="nc">metas</span> <span class="nc">}</span>
    <span class="o">|</span> <span class="nc">meta</span> <span class="nc">{</span> <span class="p">[</span><span class="s">meta</span><span class="p">]</span> <span class="nc">}</span>

<span class="nc">meta</span><span class="p">[</span><span class="s">MetaTuple</span><span class="p">]</span><span class="o">:</span>
    <span class="o">|</span> <span class="s2">&quot;@&quot;</span> <span class="nc">NAME</span> <span class="nc">NEWLINE</span> <span class="nc">{</span> <span class="p">(</span><span class="nc">name.string,</span> <span class="nc">None</span><span class="p">)</span> <span class="nc">}</span>
    <span class="o">|</span> <span class="s2">&quot;@&quot;</span> <span class="nc">a</span><span class="o">=</span><span class="nc">NAME</span> <span class="nc">b</span><span class="o">=</span><span class="nc">NAME</span> <span class="nc">NEWLINE</span> <span class="nc">{</span> <span class="p">(</span><span class="nc">a.string,</span> <span class="nc">b.string</span><span class="p">)</span> <span class="nc">}</span>
    <span class="o">|</span> <span class="s2">&quot;@&quot;</span> <span class="nc">NAME</span> <span class="nc">STRING</span> <span class="nc">NEWLINE</span> <span class="nc">{</span> <span class="p">(</span><span class="nc">name.string,</span> <span class="nc">literal_eval</span><span class="p">(</span><span class="nc">string.string</span><span class="p">))</span> <span class="nc">}</span>

<span class="nc">rules</span><span class="p">[</span><span class="s">RuleList</span><span class="p">]</span><span class="o">:</span>
    <span class="o">|</span> <span class="nc">rule</span> <span class="nc">rules</span> <span class="nc">{</span> <span class="p">[</span><span class="s">rule</span><span class="p">]</span> <span class="o">+</span> <span class="nc">rules</span> <span class="nc">}</span>
    <span class="o">|</span> <span class="nc">rule</span> <span class="nc">{</span> <span class="p">[</span><span class="s">rule</span><span class="p">]</span> <span class="nc">}</span>

<span class="nc">rule</span><span class="p">[</span><span class="s">Rule</span><span class="p">]</span><span class="o">:</span>
    <span class="o">|</span> <span class="nc">rulename</span> <span class="s2">&quot;:&quot;</span> <span class="nc">alts</span> <span class="nc">NEWLINE</span> <span class="nc">INDENT</span> <span class="nc">more_alts</span> <span class="nc">DEDENT</span> <span class="nc">{</span>
          <span class="nc">Rule</span><span class="p">(</span><span class="nc">rulename</span><span class="p">[</span><span class="s">0</span><span class="p">]</span><span class="nc">,</span> <span class="nc">rulename</span><span class="p">[</span><span class="s">1</span><span class="p">]</span><span class="nc">,</span> <span class="nc">Rhs</span><span class="p">(</span><span class="nc">alts.alts</span> <span class="o">+</span> <span class="nc">more_alts.alts</span><span class="p">))</span> <span class="nc">}</span>
    <span class="o">|</span> <span class="nc">rulename</span> <span class="s2">&quot;:&quot;</span> <span class="nc">NEWLINE</span> <span class="nc">INDENT</span> <span class="nc">more_alts</span> <span class="nc">DEDENT</span> <span class="nc">{</span> <span class="nc">Rule</span><span class="p">(</span><span class="nc">rulename</span><span class="p">[</span><span class="s">0</span><span class="p">]</span><span class="nc">,</span> <span class="nc">rulename</span><span class="p">[</span><span class="s">1</span><span class="p">]</span><span class="nc">,</span> <span class="nc">more_alts</span><span class="p">)</span> <span class="nc">}</span>
    <span class="o">|</span> <span class="nc">rulename</span> <span class="s2">&quot;:&quot;</span> <span class="nc">alts</span> <span class="nc">NEWLINE</span> <span class="nc">{</span> <span class="nc">Rule</span><span class="p">(</span><span class="nc">rulename</span><span class="p">[</span><span class="s">0</span><span class="p">]</span><span class="nc">,</span> <span class="nc">rulename</span><span class="p">[</span><span class="s">1</span><span class="p">]</span><span class="nc">,</span> <span class="nc">alts</span><span class="p">)</span> <span class="nc">}</span>

<span class="nc">rulename</span><span class="p">[</span><span class="s">RuleName</span><span class="p">]</span><span class="o">:</span>
    <span class="o">|</span> <span class="nc">NAME</span> <span class="s1">&#39;[&#39;</span> <span class="nc">type</span><span class="o">=</span><span class="nc">NAME</span> <span class="s1">&#39;*&#39;</span> <span class="s1">&#39;]&#39;</span> <span class="nc">{</span><span class="p">(</span><span class="nc">name.string,</span> <span class="nc">type.string</span><span class="o">+</span><span class="s2">&quot;*&quot;</span><span class="p">)</span><span class="nc">}</span>
    <span class="o">|</span> <span class="nc">NAME</span> <span class="s1">&#39;[&#39;</span> <span class="nc">type</span><span class="o">=</span><span class="nc">NAME</span> <span class="s1">&#39;]&#39;</span> <span class="nc">{</span><span class="p">(</span><span class="nc">name.string,</span> <span class="nc">type.string</span><span class="p">)</span><span class="nc">}</span>
    <span class="o">|</span> <span class="nc">NAME</span> <span class="nc">{</span><span class="p">(</span><span class="nc">name.string,</span> <span class="nc">None</span><span class="p">)</span><span class="nc">}</span>

<span class="nc">alts</span><span class="p">[</span><span class="s">Rhs</span><span class="p">]</span><span class="o">:</span>
    <span class="o">|</span> <span class="nc">alt</span> <span class="s2">&quot;|&quot;</span> <span class="nc">alts</span> <span class="nc">{</span> <span class="nc">Rhs</span><span class="p">([</span><span class="s">alt</span><span class="p">]</span> <span class="o">+</span> <span class="nc">alts.alts</span><span class="p">)</span><span class="nc">}</span>
    <span class="o">|</span> <span class="nc">alt</span> <span class="nc">{</span> <span class="nc">Rhs</span><span class="p">([</span><span class="s">alt</span><span class="p">])</span> <span class="nc">}</span>

<span class="nc">more_alts</span><span class="p">[</span><span class="s">Rhs</span><span class="p">]</span><span class="o">:</span>
    <span class="o">|</span> <span class="s2">&quot;|&quot;</span> <span class="nc">alts</span> <span class="nc">NEWLINE</span> <span class="nc">more_alts</span> <span class="nc">{</span> <span class="nc">Rhs</span><span class="p">(</span><span class="nc">alts.alts</span> <span class="o">+</span> <span class="nc">more_alts.alts</span><span class="p">)</span> <span class="nc">}</span>
    <span class="o">|</span> <span class="s2">&quot;|&quot;</span> <span class="nc">alts</span> <span class="nc">NEWLINE</span> <span class="nc">{</span> <span class="nc">Rhs</span><span class="p">(</span><span class="nc">alts.alts</span><span class="p">)</span> <span class="nc">}</span>

<span class="nc">alt</span><span class="p">[</span><span class="s">Alt</span><span class="p">]</span><span class="o">:</span>
    <span class="o">|</span> <span class="nc">items</span> <span class="s1">&#39;$&#39;</span> <span class="nc">action</span> <span class="nc">{</span> <span class="nc">Alt</span><span class="p">(</span><span class="nc">items</span> <span class="o">+</span> <span class="p">[</span><span class="s">NamedItem(None, NameLeaf(&#39;ENDMARKER&#39;))</span><span class="p">]</span><span class="nc">,</span> <span class="nc">action</span><span class="o">=</span><span class="nc">action</span><span class="p">)</span> <span class="nc">}</span>
    <span class="o">|</span> <span class="nc">items</span> <span class="s1">&#39;$&#39;</span> <span class="nc">{</span> <span class="nc">Alt</span><span class="p">(</span><span class="nc">items</span> <span class="o">+</span> <span class="p">[</span><span class="s">NamedItem(None, NameLeaf(&#39;ENDMARKER&#39;))</span><span class="p">]</span><span class="nc">,</span> <span class="nc">action</span><span class="o">=</span><span class="nc">None</span><span class="p">)</span> <span class="nc">}</span>
    <span class="o">|</span> <span class="nc">items</span> <span class="nc">action</span> <span class="nc">{</span> <span class="nc">Alt</span><span class="p">(</span><span class="nc">items,</span> <span class="nc">action</span><span class="o">=</span><span class="nc">action</span><span class="p">)</span> <span class="nc">}</span>
    <span class="o">|</span> <span class="nc">items</span> <span class="nc">{</span> <span class="nc">Alt</span><span class="p">(</span><span class="nc">items,</span> <span class="nc">action</span><span class="o">=</span><span class="nc">None</span><span class="p">)</span> <span class="nc">}</span>

<span class="nc">items</span><span class="p">[</span><span class="s">NamedItemList</span><span class="p">]</span><span class="o">:</span>
    <span class="o">|</span> <span class="nc">named_item</span> <span class="nc">items</span> <span class="nc">{</span> <span class="p">[</span><span class="s">named_item</span><span class="p">]</span> <span class="o">+</span> <span class="nc">items</span> <span class="nc">}</span>
    <span class="o">|</span> <span class="nc">named_item</span> <span class="nc">{</span> <span class="p">[</span><span class="s">named_item</span><span class="p">]</span> <span class="nc">}</span>

<span class="nc">named_item</span><span class="p">[</span><span class="s">NamedItem</span><span class="p">]</span><span class="o">:</span>
    <span class="o">|</span> <span class="nc">NAME</span> <span class="s1">&#39;=&#39;</span> <span class="o">~</span> <span class="nc">item</span> <span class="nc">{NamedItem</span><span class="p">(</span><span class="nc">name.string,</span> <span class="nc">item</span><span class="p">)</span><span class="nc">}</span>
    <span class="o">|</span> <span class="nc">item</span> <span class="nc">{NamedItem</span><span class="p">(</span><span class="nc">None,</span> <span class="nc">item</span><span class="p">)</span><span class="nc">}</span>
    <span class="o">|</span> <span class="nc">it</span><span class="o">=</span><span class="nc">lookahead</span> <span class="nc">{NamedItem</span><span class="p">(</span><span class="nc">None,</span> <span class="nc">it</span><span class="p">)</span><span class="nc">}</span>

<span class="nc">lookahead</span><span class="p">[</span><span class="s">LookaheadOrCut</span><span class="p">]</span><span class="o">:</span>
    <span class="o">|</span> <span class="s1">&#39;&amp;&#39;</span> <span class="o">~</span> <span class="nc">atom</span> <span class="nc">{PositiveLookahead</span><span class="p">(</span><span class="nc">atom</span><span class="p">)</span><span class="nc">}</span>
    <span class="o">|</span> <span class="s1">&#39;!&#39;</span> <span class="o">~</span> <span class="nc">atom</span> <span class="nc">{NegativeLookahead</span><span class="p">(</span><span class="nc">atom</span><span class="p">)</span><span class="nc">}</span>
    <span class="o">|</span> <span class="s1">&#39;~&#39;</span> <span class="nc">{Cut</span><span class="p">()</span><span class="nc">}</span>

<span class="nc">item</span><span class="p">[</span><span class="s">Item</span><span class="p">]</span><span class="o">:</span>
    <span class="o">|</span> <span class="s1">&#39;[&#39;</span> <span class="o">~</span> <span class="nc">alts</span> <span class="s1">&#39;]&#39;</span> <span class="nc">{Opt</span><span class="p">(</span><span class="nc">alts</span><span class="p">)</span><span class="nc">}</span>
    <span class="o">|</span>  <span class="nc">atom</span> <span class="s1">&#39;?&#39;</span> <span class="nc">{Opt</span><span class="p">(</span><span class="nc">atom</span><span class="p">)</span><span class="nc">}</span>
    <span class="o">|</span>  <span class="nc">atom</span> <span class="s1">&#39;*&#39;</span> <span class="nc">{Repeat0</span><span class="p">(</span><span class="nc">atom</span><span class="p">)</span><span class="nc">}</span>
    <span class="o">|</span>  <span class="nc">atom</span> <span class="s1">&#39;+&#39;</span> <span class="nc">{Repeat1</span><span class="p">(</span><span class="nc">atom</span><span class="p">)</span><span class="nc">}</span>
    <span class="o">|</span>  <span class="nc">sep</span><span class="o">=</span><span class="nc">atom</span> <span class="s1">&#39;.&#39;</span> <span class="nc">node</span><span class="o">=</span><span class="nc">atom</span> <span class="s1">&#39;+&#39;</span> <span class="nc">{Gather</span><span class="p">(</span><span class="nc">sep,</span> <span class="nc">node</span><span class="p">)</span><span class="nc">}</span>
    <span class="o">|</span>  <span class="nc">atom</span> <span class="nc">{atom}</span>

<span class="nc">atom</span><span class="p">[</span><span class="s">Plain</span><span class="p">]</span><span class="o">:</span>
    <span class="o">|</span> <span class="s1">&#39;(&#39;</span> <span class="o">~</span> <span class="nc">alts</span> <span class="s1">&#39;)&#39;</span> <span class="nc">{Group</span><span class="p">(</span><span class="nc">alts</span><span class="p">)</span><span class="nc">}</span>
    <span class="o">|</span> <span class="nc">NAME</span> <span class="nc">{NameLeaf</span><span class="p">(</span><span class="nc">name.string</span><span class="p">)</span> <span class="nc">}</span>
    <span class="o">|</span> <span class="nc">STRING</span> <span class="nc">{StringLeaf</span><span class="p">(</span><span class="nc">string.string</span><span class="p">)</span><span class="nc">}</span>

<span class="c1"># Mini-grammar for the actions</span>

<span class="nc">action</span><span class="p">[</span><span class="s">str</span><span class="p">]</span><span class="o">:</span> <span class="s2">&quot;{&quot;</span> <span class="o">~</span> <span class="nc">target_atoms</span> <span class="s2">&quot;}&quot;</span> <span class="nc">{</span> <span class="nc">target_atoms</span> <span class="nc">}</span>

<span class="nc">target_atoms</span><span class="p">[</span><span class="s">str</span><span class="p">]</span><span class="o">:</span>
    <span class="o">|</span> <span class="nc">target_atom</span> <span class="nc">target_atoms</span> <span class="nc">{</span> <span class="nc">target_atom</span> <span class="o">+</span> <span class="s2">&quot; &quot;</span> <span class="o">+</span> <span class="nc">target_atoms</span> <span class="nc">}</span>
    <span class="o">|</span> <span class="nc">target_atom</span> <span class="nc">{</span> <span class="nc">target_atom</span> <span class="nc">}</span>

<span class="nc">target_atom</span><span class="p">[</span><span class="s">str</span><span class="p">]</span><span class="o">:</span>
    <span class="o">|</span> <span class="s2">&quot;{&quot;</span> <span class="o">~</span> <span class="nc">target_atoms</span> <span class="s2">&quot;}&quot;</span> <span class="nc">{</span> <span class="s2">&quot;{&quot;</span> <span class="o">+</span> <span class="nc">target_atoms</span> <span class="o">+</span> <span class="s2">&quot;}&quot;</span> <span class="nc">}</span>
    <span class="o">|</span> <span class="nc">NAME</span> <span class="nc">{</span> <span class="nc">name.string</span> <span class="nc">}</span>
    <span class="o">|</span> <span class="nc">NUMBER</span> <span class="nc">{</span> <span class="nc">number.string</span> <span class="nc">}</span>
    <span class="o">|</span> <span class="nc">STRING</span> <span class="nc">{</span> <span class="nc">string.string</span> <span class="nc">}</span>
    <span class="o">|</span> <span class="s2">&quot;?&quot;</span> <span class="nc">{</span> <span class="s2">&quot;?&quot;</span> <span class="nc">}</span>
    <span class="o">|</span> <span class="s2">&quot;:&quot;</span> <span class="nc">{</span> <span class="s2">&quot;:&quot;</span> <span class="nc">}</span>
</pre></div>
</div>
<p>As an illustrative example this simple grammar file allows directly
generating a full parser that can parse simple arithmetic expressions and that
returns a valid C-based Python AST:</p>
<div class="highlight-PEG notranslate"><div class="highlight"><pre><span></span><span class="nc">start</span><span class="p">[</span><span class="s">mod_ty</span><span class="p">]</span><span class="o">:</span> <span class="nc">a</span><span class="o">=</span><span class="nc">expr_stmt</span><span class="o">*</span> <span class="nc">$</span> <span class="nc">{</span> <span class="nc">Module</span><span class="p">(</span><span class="nc">a,</span> <span class="nc">NULL,</span> <span class="nc">p-&gt;arena</span><span class="p">)</span> <span class="nc">}</span>
<span class="nc">expr_stmt</span><span class="p">[</span><span class="s">stmt_ty</span><span class="p">]</span><span class="o">:</span> <span class="nc">a</span><span class="o">=</span><span class="nc">expr</span> <span class="nc">NEWLINE</span> <span class="nc">{</span> <span class="nc">_Py_Expr</span><span class="p">(</span><span class="nc">a,</span> <span class="nc">EXTRA</span><span class="p">)</span> <span class="nc">}</span>
<span class="nc">expr</span><span class="p">[</span><span class="s">expr_ty</span><span class="p">]</span><span class="o">:</span>
    <span class="o">|</span> <span class="nc">l</span><span class="o">=</span><span class="nc">expr</span> <span class="s1">&#39;+&#39;</span> <span class="nc">r</span><span class="o">=</span><span class="nc">term</span> <span class="nc">{</span> <span class="nc">_Py_BinOp</span><span class="p">(</span><span class="nc">l,</span> <span class="nc">Add,</span> <span class="nc">r,</span> <span class="nc">EXTRA</span><span class="p">)</span> <span class="nc">}</span>
    <span class="o">|</span> <span class="nc">l</span><span class="o">=</span><span class="nc">expr</span> <span class="s1">&#39;-&#39;</span> <span class="nc">r</span><span class="o">=</span><span class="nc">term</span> <span class="nc">{</span> <span class="nc">_Py_BinOp</span><span class="p">(</span><span class="nc">l,</span> <span class="nc">Sub,</span> <span class="nc">r,</span> <span class="nc">EXTRA</span><span class="p">)</span> <span class="nc">}</span>
    <span class="o">|</span> <span class="nc">t</span><span class="o">=</span><span class="nc">term</span> <span class="nc">{</span> <span class="nc">t</span> <span class="nc">}</span>

<span class="nc">term</span><span class="p">[</span><span class="s">expr_ty</span><span class="p">]</span><span class="o">:</span>
    <span class="o">|</span> <span class="nc">l</span><span class="o">=</span><span class="nc">term</span> <span class="s1">&#39;*&#39;</span> <span class="nc">r</span><span class="o">=</span><span class="nc">factor</span> <span class="nc">{</span> <span class="nc">_Py_BinOp</span><span class="p">(</span><span class="nc">l,</span> <span class="nc">Mult,</span> <span class="nc">r,</span> <span class="nc">EXTRA</span><span class="p">)</span> <span class="nc">}</span>
    <span class="o">|</span> <span class="nc">l</span><span class="o">=</span><span class="nc">term</span> <span class="s1">&#39;/&#39;</span> <span class="nc">r</span><span class="o">=</span><span class="nc">factor</span> <span class="nc">{</span> <span class="nc">_Py_BinOp</span><span class="p">(</span><span class="nc">l,</span> <span class="nc">Div,</span> <span class="nc">r,</span> <span class="nc">EXTRA</span><span class="p">)</span> <span class="nc">}</span>
    <span class="o">|</span> <span class="nc">f</span><span class="o">=</span><span class="nc">factor</span> <span class="nc">{</span> <span class="nc">f</span> <span class="nc">}</span>

<span class="nc">factor</span><span class="p">[</span><span class="s">expr_ty</span><span class="p">]</span><span class="o">:</span>
    <span class="o">|</span> <span class="s1">&#39;(&#39;</span> <span class="nc">e</span><span class="o">=</span><span class="nc">expr</span> <span class="s1">&#39;)&#39;</span> <span class="nc">{</span> <span class="nc">e</span> <span class="nc">}</span>
    <span class="o">|</span> <span class="nc">a</span><span class="o">=</span><span class="nc">atom</span> <span class="nc">{</span> <span class="nc">a</span> <span class="nc">}</span>

<span class="nc">atom</span><span class="p">[</span><span class="s">expr_ty</span><span class="p">]</span><span class="o">:</span>
    <span class="o">|</span> <span class="nc">n</span><span class="o">=</span><span class="nc">NAME</span> <span class="nc">{</span> <span class="nc">n</span> <span class="nc">}</span>
    <span class="o">|</span> <span class="nc">n</span><span class="o">=</span><span class="nc">NUMBER</span> <span class="nc">{</span> <span class="nc">n</span> <span class="nc">}</span>
    <span class="o">|</span> <span class="nc">s</span><span class="o">=</span><span class="nc">STRING</span> <span class="nc">{</span> <span class="nc">s</span> <span class="nc">}</span>
</pre></div>
</div>
<p>Here <code class="docutils literal notranslate"><span class="pre">EXTRA</span></code> is a macro that expands to <code class="docutils literal notranslate"><span class="pre">start_lineno,</span> <span class="pre">start_col_offset,</span>
<span class="pre">end_lineno,</span> <span class="pre">end_col_offset,</span> <span class="pre">p-&gt;arena</span></code>, those being variables automatically
injected by the parser; <code class="docutils literal notranslate"><span class="pre">p</span></code> points to an object that holds on to all state
for the parser.</p>
<p>A similar grammar written to target Python AST objects:</p>
<div class="highlight-PEG notranslate"><div class="highlight"><pre><span></span><span class="nc">start</span><span class="o">:</span> <span class="nc">expr</span> <span class="nc">NEWLINE</span><span class="o">?</span> <span class="nc">ENDMARKER</span> <span class="nc">{</span> <span class="nc">ast.Expression</span><span class="p">(</span><span class="nc">expr</span><span class="p">)</span> <span class="nc">}</span>
<span class="nc">expr</span><span class="o">:</span>
    <span class="o">|</span> <span class="nc">expr</span> <span class="s1">&#39;+&#39;</span> <span class="nc">term</span> <span class="nc">{</span> <span class="nc">ast.BinOp</span><span class="p">(</span><span class="nc">expr,</span> <span class="nc">ast.Add</span><span class="p">()</span><span class="nc">,</span> <span class="nc">term</span><span class="p">)</span> <span class="nc">}</span>
    <span class="o">|</span> <span class="nc">expr</span> <span class="s1">&#39;-&#39;</span> <span class="nc">term</span> <span class="nc">{</span> <span class="nc">ast.BinOp</span><span class="p">(</span><span class="nc">expr,</span> <span class="nc">ast.Sub</span><span class="p">()</span><span class="nc">,</span> <span class="nc">term</span><span class="p">)</span> <span class="nc">}</span>
    <span class="o">|</span> <span class="nc">term</span> <span class="nc">{</span> <span class="nc">term</span> <span class="nc">}</span>

<span class="nc">term</span><span class="o">:</span>
    <span class="o">|</span> <span class="nc">l</span><span class="o">=</span><span class="nc">term</span> <span class="s1">&#39;*&#39;</span> <span class="nc">r</span><span class="o">=</span><span class="nc">factor</span> <span class="nc">{</span> <span class="nc">ast.BinOp</span><span class="p">(</span><span class="nc">l,</span> <span class="nc">ast.Mult</span><span class="p">()</span><span class="nc">,</span> <span class="nc">r</span><span class="p">)</span> <span class="nc">}</span>
    <span class="o">|</span> <span class="nc">term</span> <span class="s1">&#39;/&#39;</span> <span class="nc">factor</span> <span class="nc">{</span> <span class="nc">ast.BinOp</span><span class="p">(</span><span class="nc">term,</span> <span class="nc">ast.Div</span><span class="p">()</span><span class="nc">,</span> <span class="nc">factor</span><span class="p">)</span> <span class="nc">}</span>
    <span class="o">|</span> <span class="nc">factor</span> <span class="nc">{</span> <span class="nc">factor</span> <span class="nc">}</span>

<span class="nc">factor</span><span class="o">:</span>
    <span class="o">|</span> <span class="s1">&#39;(&#39;</span> <span class="nc">expr</span> <span class="s1">&#39;)&#39;</span> <span class="nc">{</span> <span class="nc">expr</span> <span class="nc">}</span>
    <span class="o">|</span> <span class="nc">atom</span> <span class="nc">{</span> <span class="nc">atom</span> <span class="nc">}</span>

<span class="nc">atom</span><span class="o">:</span>
    <span class="o">|</span> <span class="nc">NAME</span> <span class="nc">{</span> <span class="nc">ast.Name</span><span class="p">(</span><span class="nc">id</span><span class="o">=</span><span class="nc">name.string,</span> <span class="nc">ctx</span><span class="o">=</span><span class="nc">ast.Load</span><span class="p">())</span> <span class="nc">}</span>
    <span class="o">|</span> <span class="nc">NUMBER</span> <span class="nc">{</span> <span class="nc">ast.Constant</span><span class="p">(</span><span class="nc">value</span><span class="o">=</span><span class="nc">ast.literal_eval</span><span class="p">(</span><span class="nc">number.string</span><span class="p">))</span> <span class="nc">}</span>
</pre></div>
</div>
</section>
</section>
<section id="migration-plan">
<h2><a class="toc-backref" href="#migration-plan" role="doc-backlink">Migration plan</a></h2>
<p>This section describes the migration plan when porting to the new PEG-based parser
if this PEP is accepted. The migration will be executed in a series of steps that allow
initially to fallback to the previous parser if needed:</p>
<ol class="arabic simple">
<li>Starting with Python 3.9 alpha 6, include the new PEG-based parser machinery in CPython
with a command-line flag and environment variable that allows switching between
the new and the old parsers together with explicit APIs that allow invoking the
new and the old parsers independently. At this step, all Python APIs like <code class="docutils literal notranslate"><span class="pre">ast.parse</span></code>
and <code class="docutils literal notranslate"><span class="pre">compile</span></code> will use the parser set by the flags or the environment variable and
the default parser will be the new PEG-based parser.</li>
<li>Between Python 3.9 and Python 3.10, the old parser and related code (like the
“parser” module) will be kept until a new Python release happens (Python 3.10). In
the meanwhile and until the old parser is removed, <strong>no new Python Grammar
addition will be added that requires the PEG parser</strong>. This means that the grammar
will be kept LL(1) until the old parser is removed.</li>
<li>In Python 3.10, remove the old parser, the command-line flag, the environment
variable and the “parser” module and related code.</li>
</ol>
</section>
<section id="performance-and-validation">
<h2><a class="toc-backref" href="#performance-and-validation" role="doc-backlink">Performance and validation</a></h2>
<p>We have done extensive timing and validation of the new parser, and
this gives us confidence that the new parser is of high enough quality
to replace the current parser.</p>
<section id="validation">
<h3><a class="toc-backref" href="#validation" role="doc-backlink">Validation</a></h3>
<p>To start with validation, we regularly compile the entire Python 3.8
stdlib and compare every aspect of the resulting AST with that
produced by the standard compiler. (In the process we found a few bugs
in the standard parser’s treatment of line and column numbers, which
we have all fixed upstream via a series of issues and PRs.)</p>
<p>We have also occasionally compiled a much larger codebase (the approx.
3800 most popular packages on PyPI) and this has helped us find a (very)
few additional bugs in the new parser.</p>
<p>(One area we have not explored extensively is rejection of all wrong
programs. We have unit tests that check for a certain number of
explicit rejections, but more work could be done, e.g. by using a
fuzzer that inserts random subtle bugs into existing code. We’re open
to help in this area.)</p>
</section>
<section id="performance">
<h3><a class="toc-backref" href="#performance" role="doc-backlink">Performance</a></h3>
<p>We have tuned the performance of the new parser to come within 10% of
the current parser both in speed and memory consumption. While the
PEG/packrat parsing algorithm inherently consumes more memory than the
current LL(1) parser, we have an advantage because we don’t construct
an intermediate CST.</p>
<p>Below are some benchmarks. These are focused on compiling source code
to bytecode, because this is the most realistic situation. Returning
an AST to Python code is not as representative, because the process to
convert the <em>internal</em> AST (only accessible to C code) to an
<em>external</em> AST (an instance of <code class="docutils literal notranslate"><span class="pre">ast.AST</span></code>) takes more time than the
parser itself.</p>
<p>All measurements reported here are done on a recent MacBook Pro,
taking the median of three runs. No particular care was taken to stop
other applications running on the same machine.</p>
<p>The first timings are for our canonical test file, which has 100,000
lines endlessly repeating the following three lines:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="mi">1</span> <span class="o">+</span> <span class="mi">2</span> <span class="o">+</span> <span class="mi">4</span> <span class="o">+</span> <span class="mi">5</span> <span class="o">+</span> <span class="mi">6</span> <span class="o">+</span> <span class="mi">7</span> <span class="o">+</span> <span class="mi">8</span> <span class="o">+</span> <span class="mi">9</span> <span class="o">+</span> <span class="mi">10</span> <span class="o">+</span> <span class="p">((((((</span><span class="mi">11</span> <span class="o">*</span> <span class="mi">12</span> <span class="o">*</span> <span class="mi">13</span> <span class="o">*</span> <span class="mi">14</span> <span class="o">*</span> <span class="mi">15</span> <span class="o">+</span> <span class="mi">16</span> <span class="o">*</span> <span class="mi">17</span> <span class="o">+</span> <span class="mi">18</span> <span class="o">*</span> <span class="mi">19</span> <span class="o">*</span> <span class="mi">20</span><span class="p">))))))</span>
<span class="mi">2</span><span class="o">*</span><span class="mi">3</span> <span class="o">+</span> <span class="mi">4</span><span class="o">*</span><span class="mi">5</span><span class="o">*</span><span class="mi">6</span>
<span class="mi">12</span> <span class="o">+</span> <span class="p">(</span><span class="mi">2</span> <span class="o">*</span> <span class="mi">3</span> <span class="o">*</span> <span class="mi">4</span> <span class="o">*</span> <span class="mi">5</span> <span class="o">+</span> <span class="mi">6</span> <span class="o">+</span> <span class="mi">7</span> <span class="o">*</span> <span class="mi">8</span><span class="p">)</span>
</pre></div>
</div>
<ul class="simple">
<li>Just parsing and throwing away the internal AST takes 1.16 seconds
with a max RSS of 681 MiB.</li>
<li>Parsing and converting to <code class="docutils literal notranslate"><span class="pre">ast.AST</span></code> takes 6.34 seconds, max RSS
1029 MiB.</li>
<li>Parsing and compiling to bytecode takes 1.28 seconds, max RSS 681
MiB.</li>
<li>With the current parser, parsing and compiling takes 1.44 seconds,
max RSS 836 MiB.</li>
</ul>
<p>For this particular test file, the new parser is faster and uses less
memory than the current parser (compare the last two bullets).</p>
<p>We also did timings with a more realistic payload, the entire Python
3.8 stdlib. This payload consists of 1,641 files, 749,570 lines,
27,622,497 bytes. (Though 11 files can’t be compiled by any Python 3
parser due to encoding issues, sometimes intentional.)</p>
<ul class="simple">
<li>Compiling and throwing away the internal AST took 2.141 seconds.
That’s 350,040 lines/sec, or 12,899,367 bytes/sec. The max RSS was
74 MiB (the largest file in the stdlib is much smaller than our
canonical test file).</li>
<li>Compiling to bytecode took 3.290 seconds. That’s 227,861 lines/sec,
or 8,396,942 bytes/sec. Max RSS 77 MiB.</li>
<li>Compiling to bytecode using the current parser took 3.367 seconds.
That’s 222,620 lines/sec, or 8,203,780 bytes/sec. Max RSS 70 MiB.</li>
</ul>
<p>Comparing the last two bullets we find that the new parser is slightly
faster but uses slightly (about 10%) more memory. We believe this is
acceptable. (Also, there are probably some more tweaks we can make to
reduce memory usage.)</p>
</section>
</section>
<section id="rejected-alternatives">
<h2><a class="toc-backref" href="#rejected-alternatives" role="doc-backlink">Rejected Alternatives</a></h2>
<p>We did not seriously consider alternative ways to implement the new
parser, but here’s a brief discussion of LALR(1).</p>
<p>Thirty years ago the first author decided to go his own way with
Python’s parser rather than using LALR(1), which was the industry
standard at the time (e.g. Bison and Yacc).  The reasons were
primarily emotional (gut feelings, intuition), based on past experience
using Yacc in other projects, where grammar development took more
effort than anticipated (in part due to shift-reduce conflicts).  A
specific criticism of Bison and Yacc that still holds is that their
meta-grammar (the notation used to feed the grammar into the parser
generator) does not support EBNF conveniences like
<code class="docutils literal notranslate"><span class="pre">[optional_clause]</span></code> or <code class="docutils literal notranslate"><span class="pre">(repeated_clause)*</span></code>.  Using a custom
parser generator, a syntax tree matching the structure of the grammar
could be generated automatically, and with EBNF that tree could match
the “human-friendly” structure of the grammar.</p>
<p>Other variants of LR were not considered, nor was LL (e.g. ANTLR).
PEG was selected because it was easy to understand given a basic
understanding of recursive-descent parsing.</p>
</section>
<section id="references">
<h2><a class="toc-backref" href="#references" role="doc-backlink">References</a></h2>
<aside class="footnote-list brackets">
<aside class="footnote brackets" id="id10" role="doc-footnote">
<dt class="label" id="id10">[<a href="#id1">1</a>]</dt>
<dd>Ford, Bryan
<a class="reference external" href="https://pdos.csail.mit.edu/~baford/packrat/thesis/">https://pdos.csail.mit.edu/~baford/packrat/thesis/</a></aside>
<aside class="footnote brackets" id="id11" role="doc-footnote">
<dt class="label" id="id11">[<a href="#id2">2</a>]</dt>
<dd>Medeiros et al.
<a class="reference external" href="https://arxiv.org/pdf/1207.0443.pdf">https://arxiv.org/pdf/1207.0443.pdf</a></aside>
<aside class="footnote brackets" id="id12" role="doc-footnote">
<dt class="label" id="id12">[<a href="#id3">3</a>]</dt>
<dd>Warth et al.
<a class="reference external" href="https://web.cs.ucla.edu/~todd/research/pepm08.pdf">https://web.cs.ucla.edu/~todd/research/pepm08.pdf</a></aside>
</aside>
<p>[4] Guido’s series on PEG parsing
  <a class="reference external" href="https://medium.com/&#64;gvanrossum_83706/peg-parsing-series-de5d41b2ed60">https://medium.com/&#64;gvanrossum_83706/peg-parsing-series-de5d41b2ed60</a></p>
</section>
<section id="copyright">
<h2><a class="toc-backref" href="#copyright" role="doc-backlink">Copyright</a></h2>
<p>This document has been placed in the public domain.</p>
</section>
</section>
<hr class="docutils" />
<p>Source: <a class="reference external" href="https://github.com/python/peps/blob/main/peps/pep-0617.rst">https://github.com/python/peps/blob/main/peps/pep-0617.rst</a></p>
<p>Last modified: <a class="reference external" href="https://github.com/python/peps/commits/main/peps/pep-0617.rst">2023-12-18 21:39:13 GMT</a></p>

        </article>
        <nav id="pep-sidebar">
            <h2>Contents</h2>
            <ul>
<li><a class="reference internal" href="#overview">Overview</a></li>
<li><a class="reference internal" href="#background-on-ll-1-parsers">Background on LL(1) parsers</a></li>
<li><a class="reference internal" href="#background-on-peg-parsers">Background on PEG parsers</a></li>
<li><a class="reference internal" href="#rationale">Rationale</a><ul>
<li><a class="reference internal" href="#some-rules-are-not-actually-ll-1">Some rules are not actually LL(1)</a></li>
<li><a class="reference internal" href="#complicated-ast-parsing">Complicated AST parsing</a></li>
<li><a class="reference internal" href="#lack-of-left-recursion">Lack of left recursion</a></li>
<li><a class="reference internal" href="#intermediate-parse-tree">Intermediate parse tree</a></li>
</ul>
</li>
<li><a class="reference internal" href="#the-new-proposed-peg-parser">The new proposed PEG parser</a><ul>
<li><a class="reference internal" href="#left-recursion">Left recursion</a></li>
<li><a class="reference internal" href="#syntax">Syntax</a><ul>
<li><a class="reference internal" href="#grammar-expressions">Grammar Expressions</a><ul>
<li><a class="reference internal" href="#comment"><code class="docutils literal notranslate"><span class="pre">#</span> <span class="pre">comment</span></code></a></li>
<li><a class="reference internal" href="#e1-e2"><code class="docutils literal notranslate"><span class="pre">e1</span> <span class="pre">e2</span></code></a></li>
<li><a class="reference internal" href="#e1-e2-1"><code class="docutils literal notranslate"><span class="pre">e1</span> <span class="pre">|</span> <span class="pre">e2</span></code></a></li>
<li><a class="reference internal" href="#e"><code class="docutils literal notranslate"><span class="pre">(</span> <span class="pre">e</span> <span class="pre">)</span></code></a></li>
<li><a class="reference internal" href="#e-or-e"><code class="docutils literal notranslate"><span class="pre">[</span> <span class="pre">e</span> <span class="pre">]</span> <span class="pre">or</span> <span class="pre">e?</span></code></a></li>
<li><a class="reference internal" href="#e-1"><code class="docutils literal notranslate"><span class="pre">e*</span></code></a></li>
<li><a class="reference internal" href="#e-2"><code class="docutils literal notranslate"><span class="pre">e+</span></code></a></li>
<li><a class="reference internal" href="#s-e"><code class="docutils literal notranslate"><span class="pre">s.e+</span></code></a></li>
<li><a class="reference internal" href="#e-3"><code class="docutils literal notranslate"><span class="pre">&amp;e</span></code></a></li>
<li><a class="reference internal" href="#e-4"><code class="docutils literal notranslate"><span class="pre">!e</span></code></a></li>
<li><a class="reference internal" href="#e-5"><code class="docutils literal notranslate"><span class="pre">~</span></code></a></li>
</ul>
</li>
<li><a class="reference internal" href="#variables-in-the-grammar">Variables in the Grammar</a></li>
</ul>
</li>
<li><a class="reference internal" href="#grammar-actions">Grammar actions</a></li>
</ul>
</li>
<li><a class="reference internal" href="#migration-plan">Migration plan</a></li>
<li><a class="reference internal" href="#performance-and-validation">Performance and validation</a><ul>
<li><a class="reference internal" href="#validation">Validation</a></li>
<li><a class="reference internal" href="#performance">Performance</a></li>
</ul>
</li>
<li><a class="reference internal" href="#rejected-alternatives">Rejected Alternatives</a></li>
<li><a class="reference internal" href="#references">References</a></li>
<li><a class="reference internal" href="#copyright">Copyright</a></li>
</ul>

            <br>
            <a id="source" href="https://github.com/python/peps/blob/main/peps/pep-0617.rst">Page Source (GitHub)</a>
        </nav>
    </section>
    <script src="../_static/colour_scheme.js"></script>
    <script src="../_static/wrap_tables.js"></script>
    <script src="../_static/sticky_banner.js"></script>
</body>
</html>