import fnmatch
import io
import json
import re
import tarfile
import urllib.parse
from dataclasses import dataclass
from pathlib import Path
from typing import IO, Any, Final, TypeAlias, cast
import lxml.html
import semver
from django.utils.text import slugify
from lxml.etree import XML
from .exc import VersionAlreadyExists
from .logging import logger
from .models import Project, SphinxDocument, SphinxImage, SphinxPage, Version
from .search_indexes import SphinxPageIndex
from .settings import EXCLUDE_FROM_LATEST
#: Maps an image's original path inside the Sphinx package (leading folder
#: removed) to the :py:class:`SphinxImage` created for it during import.
ImageMap: TypeAlias = dict[str, SphinxImage]
#: Maps a downloadable document's original path inside the Sphinx package
#: (leading folder removed) to the :py:class:`SphinxDocument` created for it.
DocumentMap: TypeAlias = dict[str, SphinxDocument]
@dataclass
class PageTreeNode:
    """
    Temporary record of how one imported page relates to its neighbours.

    The page JSON produced by Sphinx identifies related pages only by their
    titles.  While pages are being imported we therefore keep, for each
    :py:class:`sphinx_hosting.models.SphinxPage` we create, the titles of its
    parent and of the page that follows it.  A second pass over these nodes
    (see :py:meth:`SphinxPackageImporter.link_pages`) turns those titles into
    real links between page objects.
    """

    #: The page this node describes
    page: SphinxPage
    #: Title of this page's parent page, or ``None`` if it has none
    parent_title: str | None = None
    #: Title of the page that follows this one, or ``None`` if there is none
    next_title: str | None = None
[docs]class SphinxPackageImporter:
"""
**Usage**: ``SphinxPackageImporter().run(sphinx_tarfilename)``
Import a tarfile of a built set of Sphinx documentation into the database.
.. important::
Before importing, there must be a
:py:class:`sphinx_hosting.models.Project` in the database whose
``machine_name`` matches the ``project`` in Sphinx's ``conf.py`` config
file for the docs to be imported.
The documentation package should have been built via the ``json`` output from
``sphinx-build``, so either:
.. code-block:: bash
make json
or:
.. code-block:: bash
sphinx-build -n -b json build/json
The tarfile should be built like so:
.. code-block:: bash
cd build
tar zcf mydocs.tar.gz json
ensuring that the package contents are enclosed in a folder.
When run, :py:class:`SphinxPackageImporter` will look inside the tarfile at
the ``globalcontext.json`` file to determine which project and version we should
associate these pages with.
* The ``project`` key will be used to look up the
:py:class:`sphinx_hosting.models.Project` to associate these Sphinx pages
with, using ``project`` as :py:attr:`sphinx_hosting.models.Project.machine_name`
* The ``version`` key will be used to create a new
:py:class:`sphinx_hosting.models.Version` object tied to that project
Once the :py:class:`sphinx_hosting.models.Version` has been created, the
pages in the tarfile will be created as
:py:class:`sphinx_hosting.models.SphinxPage` objects, and the images will be
created as :py:class:`sphinx_hosting.models.SphinxImage` objects.
"""
# Sometimes pages have weird titles -- replace them with their filename
ODD_TITLES: Final[list[str]] = ["<no title>"]
[docs] def __init__(self) -> None:
#: Used to map original Sphinx image paths to our Django storage path
self.image_map: ImageMap = {}
#: Used to map original Sphinx document paths to our Django storage path
self.document_map: DocumentMap = {}
#: Used to link pages to their parent pages, and to their next pages
self.page_tree: dict[str, PageTreeNode] = {}
self.name_map: dict[str, str] = {}
#: the contents of globalcontext.json
self.config: dict[str, Any] = {}
def _get_file(self, package: tarfile.TarFile, filename: str) -> io.BufferedReader:
"""
Look through the member names in our tarfile ``package`` for
``filename``, and return an open file descriptor on that file.
Note:
We have to do it this way instead of using ``package.getmember(name)``
because we don't know the name of the containing folder.
Args:
package: the opened Sphinx documentation tarfile
filename: the name of the file we're looking for
Raises:
KeyError: no file named ``filename`` was found in the tarfile
Returns:
The opened file descriptor for our file.
"""
if not self.name_map:
self.name_map = {
str(Path(*Path(name).parts[1:])): name for name in package.getnames()
}
if filename not in self.name_map:
msg = f'Sphinx docs TarFile has no file named "{filename}"'
raise KeyError(msg)
return cast("io.BufferedReader", package.extractfile(self.name_map[filename]))
def _update_image_src(self, body: str) -> str:
"""
Given an HTML body of a Sphinx page, update the ``<img src="path">``
references to template tag expressions that load the actual image URL
from the :py:class:`sphinx_hosting.models.SphinxImage` objects at render time.
We need to defer filling in the URL of the image until render time
because of things like storing images in S3 and using time-limited S3
auth parameters to retrieve the image from a private bucket. Those
parameters expire typically after an hour, so if we don't defer figuring
out the URL for our images, we end up storing a stale URL.
Also deal with any lightboxes by converting them to the appropriate form
to work with Tabler lightboxes.
Args:
body: the HTML body of a Sphinx document
Returns:
``body`` with its ``<img>`` urls and lightbox attributes updated
"""
if not body:
return ""
html = lxml.html.fromstring(body)
images = html.cssselect("img")
for image in images:
# The image path is relative to the Sphinx page itself, so we need to
# remove any "../" from the path so we can match the remainder to
# our image_map
src = re.sub(r"(\.\./)+", "", image.attrib["src"])
if src in self.image_map:
image.attrib["src"] = (
f"{{% sphinximage_url {self.image_map[src].id} %}}"
)
# also deal with any lightbox <a>
lightboxes = html.cssselect("a[data-lightbox]")
for lightbox in lightboxes:
lightbox.attrib["data-fslightbox"] = lightbox.attrib["data-lightbox"]
del lightbox.attrib["data-lightbox"]
if "data-title" in lightbox.attrib:
lightbox.attrib["data-caption"] = lightbox.attrib["data-title"]
del lightbox.attrib["data-title"]
src = re.sub(r"\.\./", "", lightbox.attrib["href"])
if src in self.image_map:
lightbox.attrib["href"] = (
f"{{% sphinximage_url {self.image_map[src].id} %}}"
)
return lxml.html.tostring(html).decode("utf-8")
def _update_document_href(self, body: str) -> str:
"""
Given an HTML body of a Sphinx page, update the ``<a class="reference
download internal download="" href="__the_orig_path__">`` references to
template tag expressions that load the actual document URL from the
:py:class:`sphinx_hosting.models.SphinxDocument` objects at render time.
We need to defer filling in the href of the document until render time
because of things like storing documents in S3 and using time-limited S3
auth parameters to retrieve the image from a private bucket. Those
parameters expire typically after an hour, so if we don't defer figuring
out the URL for our documents, we end up storing a stale URL.
Args:
body: the HTML body of a Sphinx document
Returns:
``body`` with its ``<a>`` hrefs updated
"""
if not body:
return ""
html = lxml.html.fromstring(body)
docs = html.cssselect("a.download")
for doc in docs:
# The document path is relative to the Sphinx page itself, so we need to
# remove any "../" from the path so we can match the remainder to
# our image_map
src = re.sub(r"(\.\./)+", "", doc.attrib["href"])
if src in self.document_map:
doc.attrib["href"] = (
f"{{% sphinxdocument_url {self.document_map[src].id} %}}"
)
return lxml.html.tostring(html).decode("utf-8")
[docs] def load_config(self, package: tarfile.TarFile) -> None:
"""
Load the ``globalcontext.json`` file for later reference.
Args:
package: the opened Sphinx documentation tarfile
"""
self.config = json.loads(self._get_file(package, "globalcontext.json").read())
def get_version(self, force: bool = False) -> Version:
    """
    Find or create the :py:class:`Version` that the package being imported
    belongs to, using the already-loaded :py:attr:`config`.

    These keys are read from :py:attr:`config`:

    * ``project``: used as-is as the ``machine_name`` of the
      :py:class:`Project` to look up
    * ``release``: the version string
    * ``sphinx_version``: recorded on the returned :py:class:`Version`

    When a matching version already exists and ``force`` is ``True``, its
    pages and images are deleted, its ``head`` is cleared, and it is re-used.

    NOTE(review): only ``pages`` and ``images`` are purged here; any
    documents previously imported for the version are not touched by this
    method.  Confirm whether that is intended.

    Keyword Args:
        force: if ``True``, re-use an existing version, purging the pages
            and images associated with it first

    Raises:
        Project.DoesNotExist: no :py:class:`Project` exists whose
            ``machine_name`` matches the ``project`` value in the package
        VersionAlreadyExists: a :py:class:`Version` with version string
            ``release`` already exists for our project, and ``force`` was
            not ``True``

    Returns:
        The saved :py:class:`Version`.
    """
    machine_name = self.config["project"]
    project = Project.objects.get(machine_name=machine_name)
    release = self.config["release"]
    existing = project.versions.filter(version=release).first()
    if not existing:
        # First import of this release: create a brand new Version
        created = Version(
            project=project,
            version=release,
            sphinx_version=self.config["sphinx_version"],
        )
        created.save()
        return created
    if not force:
        msg = (
            f'Version {release} of Project(machine_name="{machine_name}") '
            "already exists."
        )
        raise VersionAlreadyExists(msg)
    # Re-import: clear out what the previous import stored
    existing.pages.all().delete()
    existing.images.all().delete()
    existing.sphinx_version = self.config["sphinx_version"]
    existing.head = None
    existing.save()
    return existing
def import_documents(self, package: tarfile.TarFile, version: Version) -> None:
    """
    Store every downloadable document found in ``package`` as a
    :py:class:`SphinxDocument` and remember it in :py:attr:`document_map`,
    keyed by its original path, so page bodies can be rewritten later.

    A member is imported when it is a regular file with a file extension
    whose path (leading folder removed) matches ``_downloads/*/*`` and whose
    name does not start with ``.``.

    Args:
        package: the opened Sphinx documentation tarfile
        version: the :py:class:`Version` with which to associate our documents
    """
    for member in package.getmembers():
        if not member.isfile():
            continue
        # Path of the member with the enclosing top-level folder removed
        relpath = Path(*Path(member.name).parts[1:])
        if relpath.suffix == "":
            continue
        if not relpath.match("_downloads/*/*") or relpath.name.startswith("."):
            continue
        contents = package.extractfile(member)
        orig_path: str = str(relpath)
        document = SphinxDocument(version=version, orig_path=orig_path)
        # Documents are stored under their base filename only
        document.file.save(relpath.name, contents)
        document.save()
        self.document_map[orig_path] = document
        logger.info(
            "%s.document.imported project=%s version=%s orig_path=%s url=%s id=%s",
            self.__class__.__name__,
            version.project.machine_name,  # type: ignore[attr-defined]
            version.version,
            document.orig_path,
            document.file.url,
            document.id,
        )
def import_images(self, package: tarfile.TarFile, version: Version) -> None:
    """
    Store every image found in ``package`` as a :py:class:`SphinxImage` and
    remember it in :py:attr:`image_map`, keyed by its original path, so that
    ``<img src="image_path">`` references in page bodies can be rewritten
    later.  Nothing is returned; the lookup lives on the importer.

    A member is imported when it is a regular file whose path (leading
    folder removed) matches ``_images/*`` and whose name does not start with
    ``.``.

    Args:
        package: the opened Sphinx documentation tarfile
        version: the :py:class:`Version` with which to associate our images
    """
    for member in package.getmembers():
        if not member.isfile():
            continue
        # Path of the member with the enclosing top-level folder removed
        relpath = Path(*Path(member.name).parts[1:])
        if not relpath.match("_images/*") or relpath.name.startswith("."):
            continue
        contents = package.extractfile(member)
        orig_path: str = str(relpath)
        stored = SphinxImage(version=version, orig_path=orig_path)
        # Images are stored under their full original relative path
        stored.file.save(orig_path, contents)
        stored.save()
        self.image_map[orig_path] = stored
        logger.info(
            "%s.image.imported project=%s version=%s orig_path=%s url=%s id=%s",
            self.__class__.__name__,
            version.project.machine_name,  # type: ignore[attr-defined]
            version.version,
            stored.orig_path,
            stored.file.url,
            stored.id,
        )
def _fix_page_title(self, path: str, data: dict[str, Any]) -> None:
    """
    Ensure that ``data``, the JSON data from our ``.fjson`` file, ends up
    with a usable ``title`` key.  ``data`` is modified in place.

    The title is chosen as follows:

    * a missing ``title`` becomes ``"UNKNOWN"``;
    * if ``path`` is one of ``SphinxPage.SPECIAL_PAGES``, the title is
      replaced by the name registered there for that page;
    * if the resulting title is one of :py:attr:`ODD_TITLES`, it is replaced
      by ``path``.

    An earlier fallback that copied ``indextitle`` into ``title`` could
    never run, because ``title`` is always present by the time it was
    checked; that dead branch has been removed and the effective behaviour
    is unchanged.

    Args:
        path: the path of the page within the package, without the
            ``.fjson`` extension
        data: the JSON data from our ``.fjson`` file
    """
    # Some special pages don't have a 'title' key at all
    data.setdefault("title", "UNKNOWN")
    if path in SphinxPage.SPECIAL_PAGES:
        data["title"] = SphinxPage.SPECIAL_PAGES[path]
    # Sometimes pages have weird titles -- replace them with their path
    if data["title"] in self.ODD_TITLES:
        data["title"] = path
def _fix_link_hrefs(self, path: str, body: str) -> str:
"""
Given an HTML body of a Sphinx page, update the ``<a href="path">``
references for "path" to be rendered at page render time. If we don't
do this, a lot of links won't work, because they do
index page, instead of being relative to the root of the docs, and won't
work.
Args:
path: the path to the current page
body: the HTML body of a Sphinx document
Returns:
``body`` with its ``<a>`` urls and updated
"""
if not body:
return ""
# Parse the HTML body into an lxml tree
html = lxml.html.fromstring(body)
# Find all internal references except for download directives
links = html.cssselect("a.reference.internal:not(.download)")
# For each link, update its URL to be rendered at page render time
for link in links:
href = link.attrib["href"]
anchor = None
if "#" in href:
href, anchor = href.split("#")
href = href.removesuffix("/")
# To deal with relative links, we need to know our current path
# and then compute the absolute path from that.
levels = href.count("../")
if levels:
href = re.sub("^(../)*", "", href)
href = "/".join([*path.split("/")[:-levels], href])
link.attrib["href"] = (
"{{% url 'sphinx_hosting:sphinxpage--detail' project_slug='{}' version='{}' path='{}' %}}".format( # noqa:E501 # pylint: disable=line-too-long
self.config["project"], self.config["release"], href
)
)
if anchor:
link.attrib["href"] += f"#{anchor}"
# Return the updated HTML body
return lxml.html.tostring(html).decode("utf-8")
def _fix_page_body(self, path: str, data: dict[str, Any]) -> None:
"""
Do any work needed to prepare the page body before inserting into the
database. This means:
* Ensure the ``body`` key exists in ``data``
* Update the ``img`` sources to point to our Django storage location.
We uploaded them to our storage during :py:meth:`import_images`.
* Update the ``href``s for any ``<a>`` links to be rendered at page
render time.
* Update the ``<table>``s to have the CSS classes we need for them to
display nicely.
Args:
path: the path to the current page
data: the JSON data from our file
"""
if "body" not in data or data["body"] is None:
# Ensure we always have data['body'] defined as a string, for when
# we create the SphinxPage, below
data["body"] = ""
data["orig_body"] = data["body"]
if data["body"]:
# Update the img src for any images in data['body'] for to point to our
# Django storage locations
data["body"] = self._update_image_src(data["body"])
# Update the img src for any images in data['body'] for to point to our
# Django storage locations
data["body"] = self._update_document_href(data["body"])
# Update the hrefs for any <a> links to be absolute. The relative
# paths we get from Sphinx end up being relative to the Sphinx index
# document instead of to the root of the docs
data["body"] = self._fix_link_hrefs(path, data["body"])
html = lxml.html.fromstring(data["body"])
# remove the first <h1> -- we'll display the page title another way
first_h1 = html.cssselect("h1")
if first_h1:
first_h1[0].getparent().remove(first_h1[0])
# Fix our tables to look better
tables = html.cssselect("table")
for table in tables:
wrapper = XML('<div class="table-responsive"></div>')
parent = table.getparent()
parent.append(wrapper)
wrapper.insert(0, table)
table.classes.add("table")
table.classes.add("table-striped")
table.classes.add("border")
for tr in table.cssselect("thead > tr"):
tr.classes.discard("row-even")
tr.classes.discard("row-odd")
for tr in table.cssselect("th"):
tr.classes.discard("head")
tr.classes.add("p-2")
for tr in table.cssselect("tbody > tr"):
tr.classes.discard("row-even")
tr.classes.discard("row-odd")
for div in table.cssselect("tbody > tr div.line"):
div.classes.discard("line")
div.classes.add("text-start")
for div in table.cssselect("tbody > tr p"):
div.classes.add("text-start")
data["body"] = lxml.html.tostring(html).decode("utf-8")
# Unescape our template tags after lxml has converted our {% %}
# to entities.
tags = [m.group() for m in re.finditer(r"{%%20.*?%20%}", data["body"])]
tags.extend(
[m.group() for m in re.finditer(r"%7B%%20.*?%20%%7D", data["body"])]
)
for tag in tags:
data["body"] = data["body"].replace(tag, urllib.parse.unquote(tag))
# Convert the weird paragraph symbols to actual paragraph symbols
data["body"] = re.sub(r"#61633;", r"para;", data["body"])
def _fix_toc(self, data: dict[str, Any]) -> None:
"""
Update our page's local table of contents (``data['toc']`) to have the CSS
classes we need in order for it to display properly.
Args:
data: the decoded JSON data of the sphinx page
"""
if "toc" not in data:
return
data["orig_toc"] = data["toc"]
html = lxml.html.fromstring(data["toc"])
ul_first = html.cssselect("ul:first-child")[0]
# Turn the first <ul> into a tabler vertical nav
ul_first.classes.add("nav-vertical")
# Turn all <uls> into nav-pills and nav
for ul in html.cssselect("ul"):
ul.classes.add("nav")
ul.classes.add("nav-pills")
# Make all list items into nav-items
for li in html.cssselect("li"):
li.classes.add("nav-item")
# Make <a> into nav-links
for link in html.cssselect("a"):
link.classes.add("nav-link")
# Now make the embedded uls collapsable
for ul in html.cssselect("li > ul"):
wrapper = XML(
'<div class="d-flex flex-row justify-content-between align-items-center"></div>' # noqa: E501
)
link = ul.getprevious()
link.addprevious(wrapper)
wrapper.insert(0, link)
# Use Django's slugify to sanitize link.text_content() so it is safe
# for use as an HTML id. This removes unsafe characters and
# replaces spaces and punctuation with dashes.
target = f"menu-{slugify(link.text_content() or '')}"
wrapper.append(
# semgrep-reason:
# target is constructed by us using Django's slugify, so
# it's safe to use in the XML string.
XML( # nosemgrep
'<a class="toc__toggle nav-link-toggle" data-bs-toggle="collapse" '
f'aria-expanded="false" data-bs-target="#{target}"></a>'
)
)
ul.attrib["id"] = target
ul.classes.add("collapse")
try:
link = html.cssselect("a:nth-child(2)")[0]
link.attrib["aria-expanded"] = "true"
except IndexError:
pass
try:
ul = html.cssselect("li:first-child ul")[0]
ul.classes.add("show")
except IndexError:
pass
data["toc"] = lxml.html.tostring(html).decode("utf-8")
def _update_page_tree(self, page: SphinxPage, data: dict[str, Any]) -> None:
    """
    Record ``page`` in :py:attr:`page_tree`, keyed by the page's title,
    together with the titles of its parent and next pages as given in
    ``data``.  :py:meth:`link_pages` later uses these records to set the
    parent and next page on each page.

    The parent is the last entry of ``data['parents']``; the next page comes
    from ``data['next']``.  Either may be absent or empty, in which case
    the corresponding title is ``None``.

    Args:
        page: the :py:class:`sphinx_hosting.models.SphinxPage` object we created
        data: the JSON data from our page file
    """
    parent_title: str | None = None
    next_title: str | None = None
    ancestors = data.get("parents")
    if ancestors:
        ancestor_titles = [ancestor["title"] for ancestor in ancestors]
        # The closest ancestor is listed last
        parent_title = ancestor_titles[-1]
    following = data.get("next")
    if following:
        next_title = data["next"]["title"]
    node = PageTreeNode(page=page, parent_title=parent_title, next_title=next_title)
    self.page_tree[page.title] = node
def import_pages(self, package: tarfile.TarFile, version: Version) -> None:
    """
    Import all pages from ``package`` into the database as
    :py:class:`sphinx_hosting.models.SphinxPage` objects, associating them
    with :py:class:`Version` ``version``.

    Every regular file whose name ends in ``.fjson`` is treated as a page.
    Its title, body and local table of contents are fixed up before it is
    saved, and the page is recorded in :py:attr:`page_tree` for later
    consumption by :py:meth:`link_pages`.  Nothing is returned.

    Args:
        package: the tarfile of the sphinx docs
        version: the :py:class:`Version` object to associate the pages with
    """
    for member in package.getmembers():
        if not member.isfile():
            # Directories and links carry no page data
            continue
        path: str = str(Path(*Path(member.name).parts[1:]))
        if path.split("/")[-1].startswith("._"):
            # This is a Mac OS X AppleDouble hidden file.  It just has
            # MacOS specific metadata we don't care about.
            continue
        if not path.endswith(".fjson"):
            # Only files with a .fjson extension contain page data
            continue
        # Strip the extension only; ".fjson" elsewhere in the path is kept
        path = path.removesuffix(".fjson")
        fd = cast("io.BufferedReader", package.extractfile(member))
        data = json.loads(fd.read())
        self._fix_page_title(path, data)
        self._fix_page_body(path, data)
        self._fix_toc(data)
        page = SphinxPage(
            version=version,
            relative_path=path,
            content=json.dumps(data),
            title=data["title"],
            orig_body=data["orig_body"],
            body=data["body"],
            orig_local_toc=data.get("orig_toc", None),
            local_toc=data.get("toc", None),
            orig_global_toc=data.get("globaltoc", None),
        )
        page.save()
        self._update_page_tree(page, data)
        logger.info(
            "%s.page.imported project=%s version=%s relpath=%s title=%s id=%s",
            self.__class__.__name__,
            version.project.machine_name,  # type: ignore[attr-defined]
            version.version,
            page.relative_path,
            page.title,
            page.id,
        )
[docs] def link_pages(self) -> None:
"""
Given :py:attr:`page_tree``, a list of page linkages (parent, next,
prev), link all the :py:class:`sphinx_hosting.models.SphinxPage` objects
in that list to their next page and their parent page.
Args:
tree: the page linkage tree
"""
for link in self.page_tree.values():
page = link.page
logger.info(
"%s.page.linking relpath=%s title=%s id=%s",
self.__class__.__name__,
page.relative_path,
page.title,
page.id,
)
if link.parent_title:
page.parent = self.page_tree[link.parent_title].page
logger.info(
"%s.page.linked-parent relpath=%s title=%s parent=%s",
self.__class__.__name__,
page.relative_path,
page.title,
page.parent.title,
)
if link.next_title:
page.next_page = self.page_tree[link.next_title].page
logger.info(
"%s.page.linked-next relpath=%s title=%s next=%s",
self.__class__.__name__,
page.relative_path,
page.title,
page.next_page.title,
)
page.save()
def run(
    self,
    filename: str | None = None,
    file_obj: IO | None = None,
    force: bool = False,
) -> Version:
    """
    Import the Sphinx documentation package given by ``filename`` or
    ``file_obj`` into the database and return the resulting
    :py:class:`Version`.  See the class docs for
    :py:class:`SphinxPackageImporter` for how to prepare the package.

    The steps are: load ``globalcontext.json``, find or create the version,
    import images, documents and pages, link the pages together, point the
    version's ``head`` at the page named by ``root_doc``, and mark the
    searchable pages.  Finally, if the project has no latest version yet,
    or this version is not excluded by ``EXCLUDE_FROM_LATEST`` and compares
    greater than the current latest version, it becomes the project's latest
    version and the project is reindexed.

    Keyword Args:
        filename: the filename of the gzipped tar archive of the Sphinx pages
        file_obj: an open file object of the gzipped tar archive of the Sphinx pages
        force: if ``True``, overwrite the docs for an existing version

    Raises:
        Project.DoesNotExist: no :py:class:`Project` exists whose
            ``machine_name`` matches the ``project`` value in the package
        VersionAlreadyExists: a :py:class:`Version` with version string
            ``release`` from the package already exists for our project,
            and ``force`` was not ``True``

    Returns:
        The imported :py:class:`Version`.
    """
    both_given = all([filename, file_obj])
    assert (  # noqa: S101
        not both_given
    ), 'provide either "filename" or "file_obj" but not both'
    with tarfile.open(name=filename, fileobj=file_obj) as package:
        self.load_config(package)
        version = self.get_version(force=force)
        self.import_images(package, version)
        self.import_documents(package, version)
        self.import_pages(package, version)
        self.link_pages()
    # Point version.head at the top page of the documentation set
    root_page = SphinxPage.objects.get(
        version=version, relative_path=self.config["root_doc"]
    )
    version.head = root_page
    version.save()
    # Mark the appropriate pages as indexable
    version.mark_searchable_pages()
    project = version.project
    current_latest = project.latest_version  # type: ignore[attr-defined]
    if current_latest is None:
        promote = True
    else:
        excluded = any(
            fnmatch.fnmatch(version.version, glob) for glob in EXCLUDE_FROM_LATEST
        )
        # Only a non-excluded version that compares greater than the
        # current latest version replaces it
        promote = (
            not excluded
            and semver.compare(current_latest.version, version.version) < 0
        )
        if promote:
            SphinxPageIndex().remove_version(current_latest)
    if promote:
        project.latest_version = version  # type: ignore[attr-defined]
        project.save()  # type: ignore[attr-defined]
        # Reindex the whole project, since which version is the latest
        # has just changed.
        SphinxPageIndex().reindex_project(cast(Project, version.project))
    return version