Skip to content

Commit

Permalink
Refactor chapter file processing
Browse files Browse the repository at this point in the history
This refactor gets us data-typing for non-chapters (e.g., prefaces, but
mostly appendixes) with subparts for free, and also cleans the code up.
We move the logic to apply the chapter-wide data-types to its own
function and allow the (new/old) process_chapter_soup function to take
either a Path (single-file chapter) or a list (a chapter with subfiles).

**NOTE**: We have to import `Union` from `typing` to support hinting for
this; Python 3.10 gives us a much cleaner `|` operator, but Atlas
currently is on 3.9.

Update: parallelism in the initial if-check in process_chapter_soup
  • Loading branch information
delfanbaum committed Dec 22, 2022
1 parent 35465f6 commit 5bd48b3
Show file tree
Hide file tree
Showing 2 changed files with 85 additions and 60 deletions.
103 changes: 48 additions & 55 deletions jupyter_book_to_htmlbook/file_processing.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import logging
import re
from pathlib import Path
from typing import Union
from bs4 import BeautifulSoup # type: ignore
from .admonition_processing import process_admonitions
from .figure_processing import process_figures, process_informal_figs
Expand Down Expand Up @@ -83,9 +84,11 @@ def promote_headings(chapter):
return chapter


def process_chapter_single_file(toc_element):
""" single-file chapter processing """
ch_name = toc_element.stem
def apply_datatype(chapter, ch_name):
"""
Does a best-guess application of a data-type based on file name.
"""
ch_stub = re.sub('[^a-zA-Z]', '', ch_name)

# list of front and back matter guessed-at filenames
front_matter = ['preface', 'notation', 'prereqs',
Expand All @@ -98,25 +101,6 @@ def process_chapter_single_file(toc_element):
"afterword", "conclusion", 'foreword',
'introduction', 'preface']

with open(toc_element, 'r') as f:
base_soup = BeautifulSoup(f, 'lxml')

# perform initial swapping and namespace designation
try:
chapter = base_soup.find_all('section')[0]
chapter['xmlns'] = 'http://www.w3.org/1999/xhtml' # type: ignore

except IndexError: # does not have a section class for top-level
logging.warning("Looks like {toc_element.name} is malformed.")
return None, None

# promote headings
chapter = promote_headings(chapter)

# apply appropriate data-type (best guess)

ch_stub = re.sub('[^a-zA-Z]', '', ch_name)

if ch_stub.lower() in front_matter or ch_name in front_matter:
if ch_stub.lower() in allowed_data_types:
chapter['data-type'] = ch_stub.lower() # type: ignore
Expand All @@ -133,6 +117,45 @@ def process_chapter_single_file(toc_element):
chapter['data-type'] = 'chapter' # type: ignore
del chapter['class'] # type: ignore

return chapter


def process_chapter_soup(toc_element: Union[Path, list[Path]]):
    """
    Unified chapter file processing.

    Takes either a single chapter file (a Path) or an ordered list of
    chapter-part files, where the first element is the "main" chapter
    file and the rest are subparts to be appended to it.

    Parses the main file, promotes its headings, appends any subpart
    sections, and applies a best-guess data-type based on the file name.

    Returns a (chapter, ch_name) tuple, or (None, None) if the main
    file has no top-level <section> element (i.e., is malformed).
    """
    if isinstance(toc_element, list):  # i.e., an ordered list of chapter parts
        chapter_file = toc_element[0]
        chapter_parts = toc_element[1:]
    else:  # i.e., a single-file chapter
        chapter_file = toc_element
        chapter_parts = None

    ch_name = chapter_file.stem

    with open(chapter_file, 'r') as f:
        base_soup = BeautifulSoup(f, 'lxml')

    # perform initial swapping and namespace designation
    try:
        chapter = base_soup.find_all('section')[0]
        chapter['xmlns'] = 'http://www.w3.org/1999/xhtml'  # type: ignore
        del chapter['class']

    except IndexError:  # does not have a section class for top-level
        # BUG FIX: this message was missing its f-prefix, so the
        # placeholder was logged literally; also use chapter_file, since
        # toc_element may be a list (which has no .name attribute)
        logging.warning(f"Looks like {chapter_file.name} is malformed.")
        return None, None

    # promote subheadings within "base" chapter
    chapter = promote_headings(chapter)

    # fold any subpart files in as subsections of the chapter
    if chapter_parts:
        for subfile in chapter_parts:
            subsection = process_chapter_subparts(subfile)
            chapter.append(subsection)

    # apply appropriate data-type (best guess)
    chapter = apply_datatype(chapter, ch_name)

    return chapter, ch_name


Expand All @@ -155,29 +178,6 @@ def process_chapter_subparts(subfile):
return section


def compile_chapter_parts(ordered_chapter_files_list):
    """
    Build a basic, sectioned chapter soup from an ordered list of
    chapter file URIs (no other htmlbook optimizations are applied).

    The first file in the list is the "main" chapter file; each
    remaining file is processed and appended to it as a subsection.
    """
    main_file = ordered_chapter_files_list[0]

    with open(main_file, 'r') as fp:
        soup = BeautifulSoup(fp, 'lxml')

    # the first <section> in the main file becomes the chapter root
    chapter = soup.find_all('section')[0]
    chapter['data-type'] = 'chapter'  # type: ignore
    chapter['xmlns'] = 'http://www.w3.org/1999/xhtml'  # type: ignore
    del chapter['class']  # type: ignore

    # fold each remaining file in as a subsection of the chapter
    for part_file in ordered_chapter_files_list[1:]:
        chapter.append(process_chapter_subparts(part_file))  # type: ignore

    return chapter


def process_chapter(toc_element,
source_dir,
build_dir=Path('.'),
Expand All @@ -189,18 +189,11 @@ def process_chapter(toc_element,
that the files are in some /html/ directory or some such
"""

if isinstance(toc_element, Path): # single-file chapter
chapter, ch_name = process_chapter_single_file(toc_element)

# if the file happens to be bad, just return (logged elsewhere)
if chapter is None or ch_name is None:
return
chapter, ch_name = process_chapter_soup(toc_element)

else: # i.e., an ordered list of chapter parts
chapter = compile_chapter_parts(toc_element)
ch_name = toc_element[0].stem
if not chapter: # guard against malformed files
return

# see where we're at
logging.info(f"Processing {ch_name}...")

# perform cleans and processing
Expand Down
42 changes: 37 additions & 5 deletions tests/test_file_processing.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,8 @@
import pytest
import shutil
from jupyter_book_to_htmlbook.file_processing import (
compile_chapter_parts,
process_chapter,
process_chapter_single_file
process_chapter_soup
)


Expand All @@ -22,11 +21,11 @@ def test_compile_chapter_parts_happy_path(self, tmp_path):
shutil.copytree('tests/example_book/_build/html/notebooks',
test_env, dirs_exist_ok=True)

result = compile_chapter_parts([
result = process_chapter_soup([
test_env / 'ch02.00.html',
test_env / 'ch02.01.html',
test_env / 'ch02.02.html',
])
])[0]
# the resulting section should have a data-type of "chapter"
assert result["data-type"] == "chapter"
# number of level-1 subsections should be one less than the group
Expand Down Expand Up @@ -93,7 +92,7 @@ def test_chapter_promote_headings(self, tmp_path, caplog):
<a class="headerlink" href="#summary" title="Permalink to this headline">#</a>
</h2>
<p>Finally, a summary.</p></section></section>""")
result = process_chapter_single_file(tmp_path / 'ch.html')[0]
result = process_chapter_soup(tmp_path / 'ch.html')[0]
assert str(result) == """<section data-type="chapter" id="this""" + \
"""-is-another-subheading" xmlns="http:/""" + \
"""/www.w3.org/1999/xhtml">
Expand Down Expand Up @@ -217,6 +216,7 @@ def test_process_chapter_no_section(self, tmp_path):
text = f.read()
assert text.find('xmlns="http://www.w3.org/1999/xhtml"') > -1

@pytest.mark.in_dev
def test_process_chapter_totally_invalid_file(self, tmp_path, caplog):
"""
if we ever try to process something that's super malformed, don't,
Expand Down Expand Up @@ -330,3 +330,35 @@ def test_process_chapter_appendix_datatypes(self, tmp_path):
with open(test_out / 'appx_a.html') as f:
text = f.read()
assert text.find('data-type="appendix"') > -1

def test_process_appendix_with_subsections(self, tmp_path, capsys):
    """
    ensure subsections are getting data-typed appropriately when
    they're a part of an appendix
    """
    test_env = tmp_path / 'tmp'
    test_out = test_env / 'output'
    test_env.mkdir()
    test_out.mkdir()
    shutil.copytree('tests/example_book/_build/html/notebooks',
                    test_env, dirs_exist_ok=True)

    # prep files: rename the chapter fixtures so they read as
    # appendix parts
    part_numbers = ['00', '01', '02']
    for num in part_numbers:
        os.rename(test_env / f'ch02.{num}.html',
                  test_env / f'appx_a.{num}.html')

    process_chapter([test_env / f'appx_a.{num}.html'
                     for num in part_numbers],
                    test_env, test_out)

    with open(test_out / 'appx_a.00.html') as f:
        text = f.read()

    # every subsection level should be data-typed, inside an appendix
    for level in range(1, 6):
        assert f'data-type="sect{level}"' in text
    assert text.find('data-type="appendix"') > -1

0 comments on commit 5bd48b3

Please sign in to comment.