diff --git a/pdf2zh/converter.py b/pdf2zh/converter.py index 2eb6d9cd..3bc18fb6 100644 --- a/pdf2zh/converter.py +++ b/pdf2zh/converter.py @@ -41,6 +41,10 @@ log = logging.getLogger(__name__) +shs_name = "shs" +noto_name = "noto" + + class PDFConverterEx(PDFConverter): def __init__( self, @@ -134,6 +138,7 @@ def __init__( lang_out: str = "", service: str = "", resfont: str = "", + shs: Font = None, noto: Font = None, envs: Dict = None, prompt: List = None, @@ -144,6 +149,7 @@ def __init__( self.thread = thread self.layout = layout self.resfont = resfont + self.shs = shs self.noto = noto self.translator: BaseTranslator = None param = service.split(":", 1) @@ -358,8 +364,10 @@ def worker(s: str): # 多线程翻译 ############################################################ # C. 新文档排版 def raw_string(fcur: str, cstk: str): # 编码字符串 - if fcur == 'noto': + if fcur == noto_name: return "".join(["%04x" % self.noto.has_glyph(ord(c)) for c in cstk]) + elif fcur == shs_name: + return "".join(["%04x" % self.shs.has_glyph(ord(c)) for c in cstk]) elif isinstance(self.fontmap[fcur], PDFCIDFont): # 判断编码长度 return "".join(["%04x" % ord(c) for c in cstk]) else: @@ -403,8 +411,10 @@ def raw_string(fcur: str, cstk: str): # 编码字符串 pass if fcur_ is None: fcur_ = self.resfont # 默认非拉丁字体 - if fcur_ == 'noto': + if fcur_ == noto_name: # FIXME: change to CONST adv = self.noto.char_lengths(ch, size)[0] + elif fcur_ == shs_name: # FIXME: change to CONST + adv = self.shs.char_lengths(ch, size)[0] else: adv = self.fontmap[fcur_].char_width(ord(ch)) * size ptr += 1 diff --git a/pdf2zh/high_level.py b/pdf2zh/high_level.py index e5eea232..e0df332f 100644 --- a/pdf2zh/high_level.py +++ b/pdf2zh/high_level.py @@ -4,6 +4,7 @@ import io import os import sys +from tabnanny import verbose import tempfile import urllib.request from asyncio import CancelledError @@ -20,10 +21,14 @@ from pdfminer.pdfparser import PDFParser from pymupdf import Document, Font -from pdf2zh.converter import TranslateConverter +from pdf2zh.converter import TranslateConverter, shs_name, noto_name from pdf2zh.doclayout import DocLayoutModel from pdf2zh.pdfinterp import PDFPageInterpreterEx + +# FIXME +USE_SHS_FONT = True + model = DocLayoutModel.load_available() resfont_map = { @@ -85,6 +90,7 @@ def translate_patch( lang_out: str = "", service: str = "", resfont: str = "", + shs: Font = None, noto: Font = None, callback: object = None, cancellation_event: asyncio.Event = None, @@ -102,6 +108,7 @@ def translate_patch( lang_out, service, resfont, + shs, noto, kwarg.get("envs", {}), kwarg.get("prompt", []), @@ -183,11 +190,29 @@ def translate_stream( ): font_list = [("tiro", None)] noto = None + shs = None if lang_out.lower() in resfont_map: # CJK - resfont = resfont_map[lang_out.lower()] - font_list.append((resfont, None)) + if not USE_SHS_FONT: + resfont = resfont_map[lang_out.lower()] + font_list.append((resfont, None)) + else: + resfont = shs_name + # docker + ttf_path = os.environ.get("SHS_FONT_PATH", "/app/SourceHanSerif-Medium.ttc") + if not os.path.exists(ttf_path): + ttf_path = os.path.join( + tempfile.gettempdir(), "SourceHanSerif-Medium.ttc" + ) + if not os.path.exists(ttf_path): + print("Downloading SourceHanSerif font...") + urllib.request.urlretrieve( + "https://github.com/timelic/source-han-serif/releases/download/main/SourceHanSerif-Medium.ttc", + ttf_path, + ) + font_list.append((shs_name, ttf_path)) + shs = Font(shs_name, ttf_path) elif lang_out.lower() in noto_list: # noto - resfont = "noto" + resfont = noto_name # docker ttf_path = os.environ.get("NOTO_FONT_PATH", "/app/GoNotoKurrent-Regular.ttf") @@ -199,8 +224,8 @@ def translate_stream( "https://github.com/satbyy/go-noto-universal/releases/download/v7.0/GoNotoKurrent-Regular.ttf", ttf_path, ) - font_list.append(("noto", ttf_path)) - noto = Font("noto", ttf_path) + font_list.append((noto_name, ttf_path)) + noto = Font(noto_name, ttf_path) else: # fallback resfont = "china-ss" font_list.append(("china-ss", None)) @@ -233,6 +258,7 @@ def translate_stream( pass fp = io.BytesIO() + doc_zh.save(fp) obj_patch: dict = translate_patch(fp, prompt=kwarg["prompt"], **locals()) @@ -247,7 +273,12 @@ def translate_stream( for id in range(page_count): doc_en.move_page(page_count + id, id * 2 + 1) - return doc_zh.write(deflate=1), doc_en.write(deflate=1) + doc_zh.subset_fonts(fallback=True) + doc_en.subset_fonts(fallback=True) + return ( + doc_zh.write(deflate=True, garbage=3, use_objstms=1), + doc_en.write(deflate=True, garbage=3, use_objstms=1), + ) def convert_to_pdfa(input_path, output_path):