Generate ``peps.rss`` via Sphinx (#3274)

2023-08-05 17:22:22 +01:00 · 2023-08-05 17:22:22 +01:00 · 2b53c224d1
parent 7b86f6deb0
commit 2b53c224d1
6 changed files with 135 additions and 224 deletions
--- a/.github/CODEOWNERS
+++ b/.github/CODEOWNERS
@ -16,7 +16,6 @@ AUTHOR_OVERRIDES.csv    @AA-Turner
 build.py                @AA-Turner
 conf.py                 @AA-Turner
 contents.rst            @AA-Turner
 generate_rss.py         @AA-Turner
 # Linting infrastructure
 .codespell/             @CAM-Gerlach @hugovk
--- a/Makefile
+++ b/Makefile
@ -27,7 +27,7 @@ htmlview: html
 ## dirhtml        to render PEPs to "index.html" files within "pep-NNNN" directories
 .PHONY: dirhtml
 dirhtml: BUILDER = dirhtml
-dirhtml: venv rss
+dirhtml: venv
 	$(SPHINXBUILD) $(ALLSPHINXOPTS)
 ## fail-warning   to render PEPs to "pep-NNNN.html" files and fail the Sphinx build on any warning
@ -41,11 +41,6 @@ check-links: BUILDER = linkcheck
 check-links: venv
 	$(SPHINXBUILD) $(ALLSPHINXOPTS)
 ## rss            to generate the peps.rss file
 .PHONY: rss
 rss: venv
 	$(VENVDIR)/bin/python3 generate_rss.py -o $(OUTPUT_DIR)
 ## clean          to remove the venv and build files
 .PHONY: clean
 clean: clean-venv
--- a/generate_rss.py
+++ b/generate_rss.py
@ -1,210 +0,0 @@
 #!/usr/bin/env python3
 # This file is placed in the public domain or under the
 # CC0-1.0-Universal license, whichever is more permissive.
 import argparse
 import datetime as dt
 import email.utils
 from html import escape
 from pathlib import Path
 import re
 import docutils.frontend
 from docutils import nodes
 from docutils import utils
 from docutils.parsers import rst
 from docutils.parsers.rst import roles
 # Directory holding the PEP sources: taken to be the directory containing
 # this script, and used both for globbing PEP files and as the base of the
 # output directory.
 PEP_ROOT = Path(__file__).parent
 def _format_rfc_2822(datetime: dt.datetime) -> str:
    """Return *datetime* formatted as an RFC 2822 date string in GMT.

    Naive datetimes are taken to already be in UTC and are only labelled as
    such. Aware datetimes are converted to UTC first, so the instant they
    describe is preserved rather than silently relabelled.
    """
    if datetime.utcoffset() is None:
        # Naive value: attach UTC without shifting the wall-clock time
        datetime = datetime.replace(tzinfo=dt.timezone.utc)
    else:
        # Aware value: shift to UTC (format_datetime(usegmt=True) requires UTC)
        datetime = datetime.astimezone(dt.timezone.utc)
    return email.utils.format_datetime(datetime, usegmt=True)
 # Cache of header values already read from each PEP file: the outer key is
 # the file path, the inner key is the header prefix (e.g. "Title:").
 # Populated by first_line_starting_with().
 line_cache: dict[Path, dict[str, str]] = {}
 # Monkeypatch PEP and RFC reference roles to match Sphinx behaviour
 # Matches role text of the form "title <target>": group 1 is the title and
 # group 2 the target. The "<" must not be preceded by a NUL character
 # (presumably docutils' marker for a backslash escape -- TODO confirm).
 EXPLICIT_TITLE_RE = re.compile(r'^(.+?)\s*(?<!\x00)<(.*?)>$', re.DOTALL)
 def _pep_reference_role(role, rawtext, text, lineno, inliner,
                         options=None, content=None):
    """Role function for PEP references, accepting an explicit title and fragment.

    *text* is either ``NNNN`` / ``NNNN#fragment`` or ``title <NNNN#fragment>``.
    Without an explicit title the link text is ``"PEP " + text``.

    Returns a ``(nodes, messages)`` pair: a single reference node and no
    messages on success, or a problematic node plus the error message when the
    PEP number is not an integer in the range 0-9999.
    """
    # Use a fresh dict per call instead of a shared mutable default; the
    # mapping is handed to roles.set_classes() and unpacked into the node.
    options = {} if options is None else options

    matched = EXPLICIT_TITLE_RE.match(text)
    if matched:
        title = utils.unescape(matched.group(1))
        target = utils.unescape(matched.group(2))
    else:
        target = utils.unescape(text)
        title = "PEP " + utils.unescape(text)
    pep_str, _, fragment = target.partition("#")
    try:
        pepnum = int(pep_str)
        if pepnum < 0 or pepnum > 9999:
            raise ValueError
    except ValueError:
        msg = inliner.reporter.error(
            f'PEP number must be a number from 0 to 9999; "{pep_str}" is invalid.',
            line=lineno)
        prb = inliner.problematic(rawtext, rawtext, msg)
        return [prb], [msg]
    # Base URL mainly used by inliner.pep_reference; so this is correct:
    ref = (inliner.document.settings.pep_base_url
           + inliner.document.settings.pep_file_url_template % pepnum)
    if fragment:
        ref += "#" + fragment
    roles.set_classes(options)
    return [nodes.reference(rawtext, title, refuri=ref, **options)], []
 def _rfc_reference_role(role, rawtext, text, lineno, inliner,
                         options=None, content=None):
    """Role function for RFC references, accepting an explicit title and fragment.

    *text* is either ``NNNN`` / ``NNNN#fragment`` or ``title <NNNN#fragment>``.
    Without an explicit title the link text is ``"RFC " + text``.

    Returns a ``(nodes, messages)`` pair: a single reference node and no
    messages on success, or a problematic node plus the error message when the
    RFC number is not an integer in the range 0-9999.
    """
    # Use a fresh dict per call instead of a shared mutable default; the
    # mapping is handed to roles.set_classes() and unpacked into the node.
    options = {} if options is None else options

    matched = EXPLICIT_TITLE_RE.match(text)
    if matched:
        title = utils.unescape(matched.group(1))
        target = utils.unescape(matched.group(2))
    else:
        target = utils.unescape(text)
        title = "RFC " + utils.unescape(text)
    rfc_str, _, fragment = target.partition("#")
    try:
        rfcnum = int(rfc_str)
        if rfcnum < 0 or rfcnum > 9999:
            raise ValueError
    except ValueError:
        msg = inliner.reporter.error(
            f'RFC number must be a number from 0 to 9999; "{rfc_str}" is invalid.',
            line=lineno)
        prb = inliner.problematic(rawtext, rawtext, msg)
        return [prb], [msg]
    ref = (inliner.document.settings.rfc_base_url + inliner.rfc_url % rfcnum)
    if fragment:
        ref += "#" + fragment
    roles.set_classes(options)
    return [nodes.reference(rawtext, title, refuri=ref, **options)], []
 # Register the two role functions above under the canonical role names
 # "pep-reference" and "rfc-reference" (this is the monkeypatch that makes
 # these roles accept Sphinx-style explicit titles).
 roles.register_canonical_role("pep-reference", _pep_reference_role)
 roles.register_canonical_role("rfc-reference", _rfc_reference_role)
 def first_line_starting_with(full_path: Path, text: str) -> str:
    """Return the value of the first line in *full_path* that starts with *text*.

    Only the "Created:", "Title:" and "Author:" prefixes are collected. The
    value is the remainder of the line with surrounding whitespace stripped.
    Results are cached per file in ``line_cache``, so each file is read at
    most once. Returns "" when the prefix was not found (or is not one of the
    collected prefixes).
    """
    # Try and retrieve from cache
    if full_path in line_cache:
        return line_cache[full_path].get(text, "")

    # Else read source
    wanted = ("Created:", "Title:", "Author:")
    line_cache[full_path] = path_cache = {}
    with full_path.open(encoding="utf-8") as source:
        for line in source:
            for prefix in wanted:
                # Keep the first occurrence only; later lines never overwrite
                if prefix not in path_cache and line.startswith(prefix):
                    path_cache[prefix] = line.removeprefix(prefix).strip()
                    break
            # Once all have been found, exit loop
            if len(path_cache) == len(wanted):
                break
    return path_cache.get(text, "")
 def pep_creation(full_path: Path) -> dt.datetime:
    """Parse the "Created:" header of the PEP at *full_path* into a datetime.

    The header is expected in ``%d-%b-%Y`` form; ``strptime`` raises
    ``ValueError`` for anything else.
    """
    raw_created = first_line_starting_with(full_path, "Created:")
    # PEP 102's Created line has additional content after the date, so only
    # the part before the first space is kept for that file.
    is_pep_102 = full_path.stem == "pep-0102"
    date_text = raw_created.split(" ", 1)[0] if is_pep_102 else raw_created
    return dt.datetime.strptime(date_text, "%d-%b-%Y")
 def parse_rst(full_path: Path) -> nodes.document:
    """Read *full_path* as UTF-8 and parse it into a docutils document.

    The reStructuredText parser is created with ``rfc2822=True`` and default
    parser settings; the document's source name is ``<full_path>``.
    """
    source_text = full_path.read_text(encoding="utf-8")
    document = utils.new_document(
        f'<{full_path}>',
        settings=docutils.frontend.get_default_settings(rst.Parser),
    )
    parser = rst.Parser(rfc2822=True)
    parser.parse(source_text, document)
    return document
 def pep_abstract(full_path: Path) -> str:
    """Return the first paragraph of the PEP abstract.

    The PEP at *full_path* is parsed and its sections searched for one titled
    exactly "Abstract". Newlines in the paragraph are replaced by spaces.
    Returns "" when there is no such section or it contains no paragraph.
    """
    for section in parse_rst(full_path).findall(nodes.section):
        title = section.next_node(nodes.title)
        # next_node() yields None when nothing matches; skip title-less sections
        if title is None or title.astext() != "Abstract":
            continue
        paragraph = section.next_node(nodes.paragraph)
        if paragraph is None:
            return ""
        return paragraph.astext().strip().replace("\n", " ")
    return ""
 def main():
    """Generate ``peps.rss`` for the ten most recently created PEPs.

    PEP files are found by globbing ``pep-????.???`` under ``PEP_ROOT`` and
    ordered by their "Created:" header. The feed is written, UTF-8 encoded to
    match the XML declaration, to ``<output-dir>/peps.rss`` where the output
    directory comes from ``-o/--output-dir`` (default ``build``) and is
    resolved relative to ``PEP_ROOT``.
    """
    parser = argparse.ArgumentParser(description="Generate RSS feed")
    parser.add_argument(
        "-o",
        "--output-dir",
        default="build",  # synchronise with render.yaml -> deploy step
        help="Output directory, relative to root. Default 'build'.",
    )
    args = parser.parse_args()
    # get list of peps with creation time (from "Created:" string in pep source)
    peps_with_dt = sorted((pep_creation(path), path) for path in PEP_ROOT.glob("pep-????.???"))
    # generate rss items for 10 most recent peps
    items = []
    for datetime, full_path in peps_with_dt[-10:]:
        try:
            pep_num = int(full_path.stem.split("-")[-1])
        except ValueError:
            # Glob matched a file whose suffix is not a PEP number; skip it
            continue
        title = first_line_starting_with(full_path, "Title:")
        author = first_line_starting_with(full_path, "Author:")
        if "@" in author or " at " in author:
            parsed_authors = email.utils.getaddresses([author])
            joined_authors = ", ".join(f"{name} ({email_address})" for name, email_address in parsed_authors)
        else:
            joined_authors = author
        url = f"https://peps.python.org/pep-{pep_num:0>4}/"
        item = f"""\
    <item>
      <title>PEP {pep_num}: {escape(title, quote=False)}</title>
      <link>{escape(url, quote=False)}</link>
      <description>{escape(pep_abstract(full_path), quote=False)}</description>
      <author>{escape(joined_authors, quote=False)}</author>
      <guid isPermaLink="true">{url}</guid>
      <pubDate>{_format_rfc_2822(datetime)}</pubDate>
    </item>"""
        items.append(item)
    # The rss envelope
    desc = """
    Newest Python Enhancement Proposals (PEPs) - Information on new
    language features, and some meta-information like release
    procedure and schedules.
    """
    last_build_date = _format_rfc_2822(dt.datetime.now(dt.timezone.utc))
    # Items were collected oldest-first; the feed lists the newest first
    items = "\n".join(reversed(items))
    output = f"""\
 <?xml version='1.0' encoding='UTF-8'?>
 <rss xmlns:atom="http://www.w3.org/2005/Atom" xmlns:content="http://purl.org/rss/1.0/modules/content/" version="2.0">
  <channel>
    <title>Newest Python PEPs</title>
    <link>https://peps.python.org/peps.rss</link>
    <description>{" ".join(desc.split())}</description>
    <atom:link href="https://peps.python.org/peps.rss" rel="self"/>
    <docs>https://cyber.harvard.edu/rss/rss.html</docs>
    <language>en</language>
    <lastBuildDate>{last_build_date}</lastBuildDate>
 {items}
  </channel>
 </rss>
 """
    # output directory for target HTML files
    out_dir = PEP_ROOT / args.output_dir
    out_dir.mkdir(exist_ok=True, parents=True)
    # Encode explicitly: the XML declaration above promises UTF-8, whereas
    # write_text() would otherwise use the platform's default encoding.
    out_dir.joinpath("peps.rss").write_text(output, encoding="utf-8")
 # Generate the feed only when executed as a script, not when imported.
 if __name__ == "__main__":
    main()
--- a/pep_sphinx_extensions/__init__.py
+++ b/pep_sphinx_extensions/__init__.py
@ -7,6 +7,7 @@ from typing import TYPE_CHECKING
 from docutils.writers.html5_polyglot import HTMLTranslator
 from sphinx import environment
 from pep_sphinx_extensions.generate_rss import create_rss_feed
 from pep_sphinx_extensions.pep_processor.html import pep_html_builder
 from pep_sphinx_extensions.pep_processor.html import pep_html_translator
 from pep_sphinx_extensions.pep_processor.parsing import pep_banner_directive
@ -29,9 +30,7 @@ def _update_config_for_builder(app: Sphinx) -> None:
    if app.builder.name == "dirhtml":
        app.env.settings["pep_url"] = "pep-{:0>4}"
-    # internal_builder exists if Sphinx is run by build.py
+    app.connect("build-finished", _post_build)  # Post-build tasks
    if "internal_builder" not in app.tags:
        app.connect("build-finished", _post_build)  # Post-build tasks
 def _post_build(app: Sphinx, exception: Exception | None) -> None:
@ -41,7 +40,11 @@ def _post_build(app: Sphinx, exception: Exception | None) -> None:
    if exception is not None:
        return
-    create_index_file(Path(app.outdir), app.builder.name)
+
    # internal_builder exists if Sphinx is run by build.py
    if "internal_builder" not in app.tags:
        create_index_file(Path(app.outdir), app.builder.name)
    create_rss_feed(app.doctreedir, app.outdir)
 def setup(app: Sphinx) -> dict[str, bool]:
--- a/pep_sphinx_extensions/generate_rss.py
+++ b/pep_sphinx_extensions/generate_rss.py
@ -0,0 +1,120 @@
 # This file is placed in the public domain or under the
 # CC0-1.0-Universal license, whichever is more permissive.
 from __future__ import annotations
 import datetime as dt
 import pickle
 from email.utils import format_datetime, getaddresses
 from html import escape
 from pathlib import Path
 from docutils import nodes
 # Text placed in the channel-level <description> element of the feed.
 RSS_DESCRIPTION = (
    "Newest Python Enhancement Proposals (PEPs): "
    "Information on new language features "
    "and some meta-information like release procedure and schedules."
 )
 # Directory containing this module.
 # NOTE(review): no function visible in this module reads PEP_ROOT -- the feed
 # is built from the doctree directory passed to create_rss_feed(). Confirm
 # whether anything else still imports it before removing.
 PEP_ROOT = Path(__file__).parent
 def _format_rfc_2822(datetime: dt.datetime) -> str:
    """Return *datetime* formatted as an RFC 2822 date string in GMT.

    Naive datetimes are taken to already be in UTC and are only labelled as
    such. Aware datetimes are converted to UTC first, so the instant they
    describe is preserved rather than silently relabelled.
    """
    if datetime.utcoffset() is None:
        # Naive value: attach UTC without shifting the wall-clock time
        datetime = datetime.replace(tzinfo=dt.timezone.utc)
    else:
        # Aware value: shift to UTC (format_datetime(usegmt=True) requires UTC)
        datetime = datetime.astimezone(dt.timezone.utc)
    return format_datetime(datetime, usegmt=True)
 # Header values (plus the "Abstract" entry) already extracted from each
 # doctree file, keyed by the doctree path.
 document_cache: dict[Path, dict[str, str]] = {}
 def get_from_doctree(full_path: Path, text: str) -> str:
    """Return the value stored under *text* for the doctree at *full_path*.

    On first use of a path the pickled doctree is loaded, its "headers"
    mapping (populated in the PEPHeaders transform) is cached, and the first
    paragraph of the abstract is added under the key "Abstract". Later calls
    are served from ``document_cache``. Returns "" for an unknown key.

    NOTE(review): the file is read with ``pickle.loads``; this presumes the
    doctree directory only holds files written by the current build.
    """
    if full_path not in document_cache:
        doctree = pickle.loads(full_path.read_bytes())
        headers = doctree.get("headers", {})
        # Register the mapping before filling in the abstract
        document_cache[full_path] = headers
        headers["Abstract"] = pep_abstract(doctree)
    return document_cache[full_path].get(text, "")
 def pep_creation(full_path: Path) -> dt.datetime:
    """Return the creation date recorded in the doctree at *full_path*.

    The "Created" header is parsed as ``%d-%b-%Y``. When it is missing or in
    any other form, ``datetime.min`` is returned instead of raising.
    """
    created = get_from_doctree(full_path, "Created")
    try:
        parsed = dt.datetime.strptime(created, "%d-%b-%Y")
    except ValueError:
        # Fall back to the smallest representable datetime
        parsed = dt.datetime.min
    return parsed
 def pep_abstract(document: nodes.document) -> str:
    """Return the first paragraph of the PEP abstract.

    Searches *document* for a section titled exactly "Abstract" and returns
    its first paragraph with newlines replaced by spaces. Returns "" when
    there is no such section or the section contains no paragraph.
    """
    for section in document.findall(nodes.section):
        title_node = section.next_node(nodes.title)
        # next_node() yields None when nothing matches; skip title-less sections
        if title_node is None or title_node.astext() != "Abstract":
            continue
        paragraph = section.next_node(nodes.paragraph)
        if paragraph is None:
            # Abstract heading without body text: nothing to summarise
            return ""
        return paragraph.astext().strip().replace("\n", " ")
    return ""
 def _generate_items(doctree_dir: Path):
    """Yield an RSS ``<item>`` string for each of the ten newest PEPs.

    Doctrees matching ``pep-????.doctree`` in *doctree_dir* are ordered by
    creation date and the last ten are emitted newest first. A doctree whose
    "PEP" header is not an integer is skipped.
    """
    # Pair each doctree with its creation date so that sorting orders by date
    dated_doctrees = sorted(
        (pep_creation(doctree), doctree)
        for doctree in doctree_dir.glob("pep-????.doctree")
    )
    newest_ten = dated_doctrees[-10:]
    newest_ten.reverse()
    for created, full_path in newest_ten:
        try:
            pep_num = int(get_from_doctree(full_path, "PEP"))
        except ValueError:
            continue

        title = get_from_doctree(full_path, "Title")
        url = f"https://peps.python.org/pep-{pep_num:0>4}/"
        abstract = get_from_doctree(full_path, "Abstract")
        author = get_from_doctree(full_path, "Author")

        # Reformat as "Name (address)" only when the header seems to carry
        # e-mail addresses; otherwise use it verbatim.
        joined_authors = author
        if "@" in author or " at " in author:
            joined_authors = ", ".join(
                f"{name} ({email_address})"
                for name, email_address in getaddresses([author])
            )

        yield f"""\
    <item>
      <title>PEP {pep_num}: {escape(title, quote=False)}</title>
      <link>{escape(url, quote=False)}</link>
      <description>{escape(abstract, quote=False)}</description>
      <author>{escape(joined_authors, quote=False)}</author>
      <guid isPermaLink="true">{url}</guid>
      <pubDate>{_format_rfc_2822(created)}</pubDate>
    </item>"""
 def create_rss_feed(doctree_dir: Path, output_dir: Path):
    """Write ``peps.rss`` into *output_dir* from the doctrees in *doctree_dir*.

    The channel envelope is stamped with the current UTC time as
    ``lastBuildDate`` and filled with the items from ``_generate_items``.
    The file is written UTF-8 encoded.
    """
    # Timestamp is taken before the items are generated
    built_at = _format_rfc_2822(dt.datetime.now(dt.timezone.utc))
    rss_items = "\n".join(_generate_items(Path(doctree_dir)))
    feed = f"""\
 <?xml version='1.0' encoding='UTF-8'?>
 <rss xmlns:atom="http://www.w3.org/2005/Atom" xmlns:content="http://purl.org/rss/1.0/modules/content/" version="2.0">
  <channel>
    <title>Newest Python PEPs</title>
    <link>https://peps.python.org/peps.rss</link>
    <description>{RSS_DESCRIPTION}</description>
    <atom:link href="https://peps.python.org/peps.rss" rel="self"/>
    <docs>https://cyber.harvard.edu/rss/rss.html</docs>
    <language>en</language>
    <lastBuildDate>{built_at}</lastBuildDate>
 {rss_items}
  </channel>
 </rss>
 """
    # The feed sits directly inside the build output directory
    target = Path(output_dir) / "peps.rss"
    target.write_text(feed, encoding="utf-8")
--- a/pep_sphinx_extensions/pep_processor/transforms/pep_headers.py
+++ b/pep_sphinx_extensions/pep_processor/transforms/pep_headers.py
@ -72,11 +72,11 @@ class PEPHeaders(transforms.Transform):
            raise PEPParsingError("Document does not contain an RFC-2822 'PEP' header!")
        # Extract PEP number
-        value = pep_field[1].astext()
+        pep_num_str = pep_field[1].astext()
        try:
-            pep_num = int(value)
+            pep_num = int(pep_num_str)
        except ValueError:
-            raise PEPParsingError(f"'PEP' header must contain an integer. '{value}' is invalid!")
+            raise PEPParsingError(f"PEP header must contain an integer. '{pep_num_str}' is invalid!")
        # Special processing for PEP 0.
        if pep_num == 0:
@ -89,7 +89,11 @@ class PEPHeaders(transforms.Transform):
            raise PEPParsingError("No title!")
        fields_to_remove = []
        self.document["headers"] = headers = {}
        for field in header:
            row_attributes = {sub.tagname: sub.rawsource for sub in field}
            headers[row_attributes["field_name"]] = row_attributes["field_body"]
            name = field[0].astext().lower()
            body = field[1]
            if len(body) == 0: