diff --git a/jupyter_book_to_htmlbook/file_processing.py b/jupyter_book_to_htmlbook/file_processing.py index 62b0709..ad6c46a 100644 --- a/jupyter_book_to_htmlbook/file_processing.py +++ b/jupyter_book_to_htmlbook/file_processing.py @@ -1,6 +1,7 @@ import logging import re from pathlib import Path +from typing import Union from bs4 import BeautifulSoup # type: ignore from .admonition_processing import process_admonitions from .figure_processing import process_figures, process_informal_figs @@ -83,9 +84,11 @@ def promote_headings(chapter): return chapter -def process_chapter_single_file(toc_element): - """ single-file chapter processing """ - ch_name = toc_element.stem +def apply_datatype(chapter, ch_name): + """ + Does a best-guess application of a data-type based on file name. + """ + ch_stub = re.sub('[^a-zA-Z]', '', ch_name) # list of front and back matter guessed-at filenames front_matter = ['preface', 'notation', 'prereqs', @@ -98,25 +101,6 @@ def process_chapter_single_file(toc_element): "afterword", "conclusion", 'foreword', 'introduction', 'preface'] - with open(toc_element, 'r') as f: - base_soup = BeautifulSoup(f, 'lxml') - - # perform initial swapping and namespace designation - try: - chapter = base_soup.find_all('section')[0] - chapter['xmlns'] = 'http://www.w3.org/1999/xhtml' # type: ignore - - except IndexError: # does not have a section class for top-level - logging.warning("Looks like {toc_element.name} is malformed.") - return None, None - - # promote headings - chapter = promote_headings(chapter) - - # apply appropriate data-type (best guess) - - ch_stub = re.sub('[^a-zA-Z]', '', ch_name) - if ch_stub.lower() in front_matter or ch_name in front_matter: if ch_stub.lower() in allowed_data_types: chapter['data-type'] = ch_stub.lower() # type: ignore @@ -133,6 +117,45 @@ def process_chapter_single_file(toc_element): chapter['data-type'] = 'chapter' # type: ignore del chapter['class'] # type: ignore + return chapter + + +def process_chapter_soup(toc_element: Union[Path, list[Path]]): + """ unified file chapter processing """ + + if isinstance(toc_element, list): # i.e., an ordered list of chapter parts + chapter_file = toc_element[0] + chapter_parts = toc_element[1:] + else: + chapter_file = toc_element + chapter_parts = None + + ch_name = chapter_file.stem + + with open(chapter_file, 'r') as f: + base_soup = BeautifulSoup(f, 'lxml') + + # perform initial swapping and namespace designation + try: + chapter = base_soup.find_all('section')[0] + chapter['xmlns'] = 'http://www.w3.org/1999/xhtml' # type: ignore + del chapter['class'] + + except IndexError: # does not have a section class for top-level + logging.warning("Looks like {toc_element.name} is malformed.") + return None, None + + # promote subheadings within "base" chapter + chapter = promote_headings(chapter) + + if chapter_parts: + for subfile in chapter_parts: + subsection = process_chapter_subparts(subfile) + chapter.append(subsection) + + # apply appropriate data-type (best guess) + chapter = apply_datatype(chapter, ch_name) + return chapter, ch_name @@ -155,29 +178,6 @@ def process_chapter_subparts(subfile): return section -def compile_chapter_parts(ordered_chapter_files_list): - """ - Takes a list of chapter file URIs and returns a basic, sectioned - chapter soup (i.e., with no other htmlbook optimizations) - """ - # work with main file - base_chapter_file = ordered_chapter_files_list[0] - with open(base_chapter_file, 'r') as f: - base_soup = BeautifulSoup(f, 'lxml') - sections = base_soup.find_all('section') - chapter = sections[0] # first section is the "main" section - chapter['data-type'] = 'chapter' # type: ignore - chapter['xmlns'] = 'http://www.w3.org/1999/xhtml' # type: ignore - del chapter['class'] # type: ignore - - # work with subfiles - for subfile in ordered_chapter_files_list[1:]: - subsection = process_chapter_subparts(subfile) - chapter.append(subsection) # type: ignore - - return chapter - - def process_chapter(toc_element, source_dir, build_dir=Path('.'), @@ -189,18 +189,11 @@ def process_chapter(toc_element, that the files are in some /html/ directory or some such """ - if isinstance(toc_element, Path): # single-file chapter - chapter, ch_name = process_chapter_single_file(toc_element) - - # if the file happens to be bad, just return (logged elsewhere) - if chapter is None or ch_name is None: - return + chapter, ch_name = process_chapter_soup(toc_element) - else: # i.e., an ordered list of chapter parts - chapter = compile_chapter_parts(toc_element) - ch_name = toc_element[0].stem + if not chapter: # guard against malformed files + return - # see where we're at logging.info(f"Processing {ch_name}...") # perform cleans and processing diff --git a/tests/test_file_processing.py b/tests/test_file_processing.py index b545c69..dfc2234 100644 --- a/tests/test_file_processing.py +++ b/tests/test_file_processing.py @@ -3,9 +3,8 @@ import pytest import shutil from jupyter_book_to_htmlbook.file_processing import ( - compile_chapter_parts, process_chapter, - process_chapter_single_file + process_chapter_soup ) @@ -22,11 +21,11 @@ def test_compile_chapter_parts_happy_path(self, tmp_path): shutil.copytree('tests/example_book/_build/html/notebooks', test_env, dirs_exist_ok=True) - result = compile_chapter_parts([ + result = process_chapter_soup([ test_env / 'ch02.00.html', test_env / 'ch02.01.html', test_env / 'ch02.02.html', - ]) + ])[0] # the resulting section should have a data-type of "chapter" assert result["data-type"] == "chapter" # number of level-1 subsections should be one less than the group @@ -93,7 +92,7 @@ def test_chapter_promote_headings(self, tmp_path, caplog): #

Finally, a summary.

""") - result = process_chapter_single_file(tmp_path / 'ch.html')[0] + result = process_chapter_soup(tmp_path / 'ch.html')[0] assert str(result) == """
@@ -217,6 +216,7 @@ def test_process_chapter_no_section(self, tmp_path): text = f.read() assert text.find('xmlns="http://www.w3.org/1999/xhtml"') > -1 + @pytest.mark.in_dev def test_process_chapter_totally_invalid_file(self, tmp_path, caplog): """ if we ever try to process something that's super malformed, don't, @@ -330,3 +330,35 @@ def test_process_chapter_appendix_datatypes(self, tmp_path): with open(test_out / 'appx_a.html') as f: text = f.read() assert text.find('data-type="appendix"') > -1 + + def test_process_appendix_with_subsections(self, tmp_path, capsys): + """ + ensure subsections are getting data-typed appropriately when + they're a part of an appendix + """ + test_env = tmp_path / 'tmp' + test_out = test_env / 'output' + test_env.mkdir() + test_out.mkdir() + shutil.copytree('tests/example_book/_build/html/notebooks', + test_env, dirs_exist_ok=True) + + # prep files + os.rename(test_env / 'ch02.00.html', test_env / 'appx_a.00.html') + os.rename(test_env / 'ch02.01.html', test_env / 'appx_a.01.html') + os.rename(test_env / 'ch02.02.html', test_env / 'appx_a.02.html') + + process_chapter([ + test_env / 'appx_a.00.html', + test_env / 'appx_a.01.html', + test_env / 'appx_a.02.html', + ], test_env, test_out) + + with open(test_out / 'appx_a.00.html') as f: + text = f.read() + assert 'data-type="sect1"' in text + assert 'data-type="sect2"' in text + assert 'data-type="sect3"' in text + assert 'data-type="sect4"' in text + assert 'data-type="sect5"' in text + assert text.find('data-type="appendix"') > -1