Skip to content

Commit

Permalink
basic logic
Browse files Browse the repository at this point in the history
  • Loading branch information
timelic committed Dec 25, 2024
1 parent 80c4a1b commit dba927e
Show file tree
Hide file tree
Showing 2 changed files with 50 additions and 9 deletions.
14 changes: 12 additions & 2 deletions pdf2zh/converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,10 @@
log = logging.getLogger(__name__)


# Font names that select a separately loaded font object rather than an entry
# in the converter's font map: "shs" for Source Han Serif, "noto" for Go Noto.
shs_name = "shs"
noto_name = "noto"


class PDFConverterEx(PDFConverter):
def __init__(
self,
Expand Down Expand Up @@ -134,6 +138,7 @@ def __init__(
lang_out: str = "",
service: str = "",
resfont: str = "",
shs: Font = None,
noto: Font = None,
envs: Dict = None,
prompt: List = None,
Expand All @@ -144,6 +149,7 @@ def __init__(
self.thread = thread
self.layout = layout
self.resfont = resfont
self.shs = shs
self.noto = noto
self.translator: BaseTranslator = None
param = service.split(":", 1)
Expand Down Expand Up @@ -358,8 +364,10 @@ def worker(s: str): # 多线程翻译
############################################################
# C. 新文档排版
def raw_string(fcur: str, cstk: str): # 编码字符串
if fcur == 'noto':
if fcur == noto_name:
return "".join(["%04x" % self.noto.has_glyph(ord(c)) for c in cstk])
elif fcur == shs_name:
return "".join(["%04x" % self.shs.has_glyph(ord(c)) for c in cstk])
elif isinstance(self.fontmap[fcur], PDFCIDFont): # 判断编码长度
return "".join(["%04x" % ord(c) for c in cstk])
else:
Expand Down Expand Up @@ -403,8 +411,10 @@ def raw_string(fcur: str, cstk: str): # 编码字符串
pass
if fcur_ is None:
fcur_ = self.resfont # 默认非拉丁字体
if fcur_ == 'noto':
if fcur_ == noto_name: # FIXME: change to CONST
adv = self.noto.char_lengths(ch, size)[0]
elif fcur_ == shs_name: # FIXME: change to CONST
adv = self.shs.char_lengths(ch, size)[0]
else:
adv = self.fontmap[fcur_].char_width(ord(ch)) * size
ptr += 1
Expand Down
45 changes: 38 additions & 7 deletions pdf2zh/high_level.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import io
import os
import sys
from tabnanny import verbose
import tempfile
import urllib.request
from asyncio import CancelledError
Expand All @@ -20,10 +21,14 @@
from pdfminer.pdfparser import PDFParser
from pymupdf import Document, Font

from pdf2zh.converter import TranslateConverter
from pdf2zh.converter import TranslateConverter, shs_name, noto_name
from pdf2zh.doclayout import DocLayoutModel
from pdf2zh.pdfinterp import PDFPageInterpreterEx


# FIXME: hard-coded switch. When True, output languages listed in resfont_map
# are rendered with the Source Han Serif font file (located or downloaded in
# translate_stream) instead of the font named in resfont_map.
USE_SHS_FONT = True

model = DocLayoutModel.load_available()

resfont_map = {
Expand Down Expand Up @@ -85,6 +90,7 @@ def translate_patch(
lang_out: str = "",
service: str = "",
resfont: str = "",
shs: Font = None,
noto: Font = None,
callback: object = None,
cancellation_event: asyncio.Event = None,
Expand All @@ -102,6 +108,7 @@ def translate_patch(
lang_out,
service,
resfont,
shs,
noto,
kwarg.get("envs", {}),
kwarg.get("prompt", []),
Expand Down Expand Up @@ -183,11 +190,29 @@ def translate_stream(
):
font_list = [("tiro", None)]
noto = None
shs = None
if lang_out.lower() in resfont_map: # CJK
resfont = resfont_map[lang_out.lower()]
font_list.append((resfont, None))
if not USE_SHS_FONT:
resfont = resfont_map[lang_out.lower()]
font_list.append((resfont, None))
else:
resfont = shs_name
# docker
ttf_path = os.environ.get("SHS_FONT_PATH", "/app/SourceHanSerif-Medium.ttc")
if not os.path.exists(ttf_path):
ttf_path = os.path.join(
tempfile.gettempdir(), "SourceHanSerif-Medium.ttc"
)
if not os.path.exists(ttf_path):
print("Downloading SourceHanSerif font...")
urllib.request.urlretrieve(
"https://github.com/timelic/source-han-serif/releases/download/main/SourceHanSerif-Medium.ttc",
ttf_path,
)
font_list.append((shs_name, ttf_path))
shs = Font(shs_name, ttf_path)
elif lang_out.lower() in noto_list: # noto
resfont = "noto"
resfont = noto_name
# docker
ttf_path = os.environ.get("NOTO_FONT_PATH", "/app/GoNotoKurrent-Regular.ttf")

Expand All @@ -199,8 +224,8 @@ def translate_stream(
"https://github.com/satbyy/go-noto-universal/releases/download/v7.0/GoNotoKurrent-Regular.ttf",
ttf_path,
)
font_list.append(("noto", ttf_path))
noto = Font("noto", ttf_path)
font_list.append((noto_name, ttf_path))
noto = Font(noto_name, ttf_path)
else: # fallback
resfont = "china-ss"
font_list.append(("china-ss", None))
Expand Down Expand Up @@ -233,6 +258,7 @@ def translate_stream(
pass

fp = io.BytesIO()

doc_zh.save(fp)
obj_patch: dict = translate_patch(fp, prompt=kwarg["prompt"], **locals())

Expand All @@ -247,7 +273,12 @@ def translate_stream(
for id in range(page_count):
doc_en.move_page(page_count + id, id * 2 + 1)

return doc_zh.write(deflate=1), doc_en.write(deflate=1)
doc_zh.subset_fonts(fallback=True)
doc_en.subset_fonts(fallback=True)
return (
doc_zh.write(deflate=True, garbage=3, use_objstms=1),
doc_en.write(deflate=True, garbage=3, use_objstms=1),
)


def convert_to_pdfa(input_path, output_path):
Expand Down

0 comments on commit dba927e

Please sign in to comment.