Skip to content

Commit

Permalink
Refactor chapter file processing
Browse files Browse the repository at this point in the history
This refactor gets us data-typing for non-chapters (e.g., prefaces, but
mostly appendixes) with subparts for free, and also cleans the code up.
We move the logic to apply the chapter-wide data-types to its own
function and allow the (new/old) process_chapter_soup function to take
either a Path (single-file chapter) or a list (a chapter with subfiles).

**NOTE**: We have to import `Union` from `typing` to support hinting for
this; Python 3.10 gives us a much cleaner `|` operator, but Atlas
currently is on 3.9.

Update: parallelism in the initial if-check in process_chapter_soup
  • Loading branch information
delfanbaum committed Dec 22, 2022
1 parent 35465f6 commit 5bd48b3
Show file tree
Hide file tree
Showing 2 changed files with 85 additions and 60 deletions.
103 changes: 48 additions & 55 deletions jupyter_book_to_htmlbook/file_processing.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import logging
import re
from pathlib import Path
from typing import Union
from bs4 import BeautifulSoup # type: ignore
from .admonition_processing import process_admonitions
from .figure_processing import process_figures, process_informal_figs
Expand Down Expand Up @@ -83,9 +84,11 @@ def promote_headings(chapter):
return chapter


def process_chapter_single_file(toc_element):
""" single-file chapter processing """
ch_name = toc_element.stem
def apply_datatype(chapter, ch_name):
"""
Does a best-guess application of a data-type based on file name.
"""
ch_stub = re.sub('[^a-zA-Z]', '', ch_name)

# list of front and back matter guessed-at filenames
front_matter = ['preface', 'notation', 'prereqs',
Expand All @@ -98,25 +101,6 @@ def process_chapter_single_file(toc_element):
"afterword", "conclusion", 'foreword',
'introduction', 'preface']

with open(toc_element, 'r') as f:
base_soup = BeautifulSoup(f, 'lxml')

# perform initial swapping and namespace designation
try:
chapter = base_soup.find_all('section')[0]
chapter['xmlns'] = 'http://www.w3.org/1999/xhtml' # type: ignore

except IndexError: # does not have a section class for top-level
logging.warning("Looks like {toc_element.name} is malformed.")
return None, None

# promote headings
chapter = promote_headings(chapter)

# apply appropriate data-type (best guess)

ch_stub = re.sub('[^a-zA-Z]', '', ch_name)

if ch_stub.lower() in front_matter or ch_name in front_matter:
if ch_stub.lower() in allowed_data_types:
chapter['data-type'] = ch_stub.lower() # type: ignore
Expand All @@ -133,6 +117,45 @@ def process_chapter_single_file(toc_element):
chapter['data-type'] = 'chapter' # type: ignore
del chapter['class'] # type: ignore

return chapter


def process_chapter_soup(toc_element: Union[Path, list[Path]]):
    """
    Unified chapter file processing.

    Takes either a single chapter file (a Path) or an ordered list of
    chapter-part files, where the first element is the "main" chapter
    file and the rest are subparts to be appended to it.

    Parses the main file, promotes its headings, appends any subpart
    sections, and applies a best-guess data-type based on the file name.

    Returns a (chapter, ch_name) tuple, or (None, None) if the main
    file has no top-level <section> element (i.e., is malformed).
    """
    if isinstance(toc_element, list):  # i.e., an ordered list of chapter parts
        chapter_file = toc_element[0]
        chapter_parts = toc_element[1:]
    else:  # i.e., a single-file chapter
        chapter_file = toc_element
        chapter_parts = None

    ch_name = chapter_file.stem

    with open(chapter_file, 'r') as f:
        base_soup = BeautifulSoup(f, 'lxml')

    # perform initial swapping and namespace designation
    try:
        chapter = base_soup.find_all('section')[0]
        chapter['xmlns'] = 'http://www.w3.org/1999/xhtml'  # type: ignore
        del chapter['class']

    except IndexError:  # does not have a section class for top-level
        # BUG FIX: this message was missing its f-prefix, so the
        # placeholder was logged literally; also use chapter_file, since
        # toc_element may be a list (which has no .name attribute)
        logging.warning(f"Looks like {chapter_file.name} is malformed.")
        return None, None

    # promote subheadings within "base" chapter
    chapter = promote_headings(chapter)

    # fold any subpart files in as subsections of the chapter
    if chapter_parts:
        for subfile in chapter_parts:
            subsection = process_chapter_subparts(subfile)
            chapter.append(subsection)

    # apply appropriate data-type (best guess)
    chapter = apply_datatype(chapter, ch_name)

    return chapter, ch_name


Expand All @@ -155,29 +178,6 @@ def process_chapter_subparts(subfile):
return section


def compile_chapter_parts(ordered_chapter_files_list):
    """
    Build a basic, sectioned chapter soup from an ordered list of
    chapter file URIs (no other htmlbook optimizations are applied).

    The first file in the list is the "main" chapter file; each
    remaining file is processed and appended to it as a subsection.
    """
    main_file = ordered_chapter_files_list[0]

    with open(main_file, 'r') as fp:
        soup = BeautifulSoup(fp, 'lxml')

    # the first <section> in the main file becomes the chapter root
    chapter = soup.find_all('section')[0]
    chapter['data-type'] = 'chapter'  # type: ignore
    chapter['xmlns'] = 'http://www.w3.org/1999/xhtml'  # type: ignore
    del chapter['class']  # type: ignore

    # fold each remaining file in as a subsection of the chapter
    for part_file in ordered_chapter_files_list[1:]:
        chapter.append(process_chapter_subparts(part_file))  # type: ignore

    return chapter


def process_chapter(toc_element,
source_dir,
build_dir=Path('.'),
Expand All @@ -189,18 +189,11 @@ def process_chapter(toc_element,
that the files are in some /html/ directory or some such
"""

if isinstance(toc_element, Path): # single-file chapter
chapter, ch_name = process_chapter_single_file(toc_element)

# if the file happens to be bad, just return (logged elsewhere)
if chapter is None or ch_name is None:
return
chapter, ch_name = process_chapter_soup(toc_element)

else: # i.e., an ordered list of chapter parts
chapter = compile_chapter_parts(toc_element)
ch_name = toc_element[0].stem
if not chapter: # guard against malformed files
return

# see where we're at
logging.info(f"Processing {ch_name}...")

# perform cleans and processing
Expand Down
42 changes: 37 additions & 5 deletions tests/test_file_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,8 @@
import pytest
import shutil
from jupyter_book_to_htmlbook.file_processing import (
compile_chapter_parts,
process_chapter,
process_chapter_single_file
process_chapter_soup
)


Expand All @@ -22,11 +21,11 @@ def test_compile_chapter_parts_happy_path(self, tmp_path):
shutil.copytree('tests/example_book/_build/html/notebooks',
test_env, dirs_exist_ok=True)

result = compile_chapter_parts([
result = process_chapter_soup([
test_env / 'ch02.00.html',
test_env / 'ch02.01.html',
test_env / 'ch02.02.html',
])
])[0]
# the resulting section should have a data-type of "chapter"
assert result["data-type"] == "chapter"
# number of level-1 subsections should be one less than the group
Expand Down Expand Up @@ -93,7 +92,7 @@ def test_chapter_promote_headings(self, tmp_path, caplog):
<a class="headerlink" href="#summary" title="Permalink to this headline">#</a>
</h2>
<p>Finally, a summary.</p></section></section>""")
result = process_chapter_single_file(tmp_path / 'ch.html')[0]
result = process_chapter_soup(tmp_path / 'ch.html')[0]
assert str(result) == """<section data-type="chapter" id="this""" + \
"""-is-another-subheading" xmlns="http:/""" + \
"""/www.w3.org/1999/xhtml">
Expand Down Expand Up @@ -217,6 +216,7 @@ def test_process_chapter_no_section(self, tmp_path):
text = f.read()
assert text.find('xmlns="http://www.w3.org/1999/xhtml"') > -1

@pytest.mark.in_dev
def test_process_chapter_totally_invalid_file(self, tmp_path, caplog):
"""
if we ever try to process something that's super malformed, don't,
Expand Down Expand Up @@ -330,3 +330,35 @@ def test_process_chapter_appendix_datatypes(self, tmp_path):
with open(test_out / 'appx_a.html') as f:
text = f.read()
assert text.find('data-type="appendix"') > -1

def test_process_appendix_with_subsections(self, tmp_path, capsys):
    """
    ensure subsections are getting data-typed appropriately when
    they're a part of an appendix
    """
    test_env = tmp_path / 'tmp'
    test_out = test_env / 'output'
    test_env.mkdir()
    test_out.mkdir()
    shutil.copytree('tests/example_book/_build/html/notebooks',
                    test_env, dirs_exist_ok=True)

    # prep files: rename the chapter fixtures so they read as
    # appendix parts
    part_numbers = ['00', '01', '02']
    for num in part_numbers:
        os.rename(test_env / f'ch02.{num}.html',
                  test_env / f'appx_a.{num}.html')

    process_chapter([test_env / f'appx_a.{num}.html'
                     for num in part_numbers],
                    test_env, test_out)

    with open(test_out / 'appx_a.00.html') as f:
        text = f.read()

    # every subsection level should be data-typed, inside an appendix
    for level in range(1, 6):
        assert f'data-type="sect{level}"' in text
    assert text.find('data-type="appendix"') > -1

0 comments on commit 5bd48b3

Please sign in to comment.