diff --git a/.github/workflows/python-build.yml b/.github/workflows/python-build.yml new file mode 100644 index 00000000..11433830 --- /dev/null +++ b/.github/workflows/python-build.yml @@ -0,0 +1,31 @@ +name: Build Python Package + +on: + push: + branches: + - main + pull_request: + +jobs: + build: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + - name: Set up Python + uses: actions/setup-python@v3 + with: + python-version: '3.x' + + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install build flake8 black + + - name: Check code format + run: | + black --check --diff --color pdf2zh/*.py + flake8 + + - name: Build package + run: python -m build diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 00000000..43d3f66d --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,14 @@ +# See https://pre-commit.com for more information +# See https://pre-commit.com/hooks.html for more hooks +files: '^.*\.py$' +repos: +- repo: local + hooks: + - id: black + name: black + entry: black --check --diff --color + language: python + - id: flake8 + name: flake8 + entry: flake8 + language: python diff --git a/README.md b/README.md index 05b11657..c1ce5e8f 100644 --- a/README.md +++ b/README.md @@ -37,13 +37,15 @@ Feel free to provide feedback in [GitHub Issues](https://github.com/Byaidu/PDFMa

Updates

+- [Nov. 23 2024] Firewall for preventing web bots *(by [@Byaidu](https://github.com/Byaidu))* +- [Nov. 22 2024] GUI now supports Italian, and has been improved *(by [@Byaidu](https://github.com/Byaidu), [@reycn](https://github.com/reycn))* +- [Nov. 22 2024] You can now share your deployed service with others *(by [@Zxis233](https://github.com/Zxis233))* +- [Nov. 22 2024] Now supports Tencent Translation *(by [@hellofinch](https://github.com/hellofinch))* - [Nov. 21 2024] GUI now supports downloading dual-document *(by [@reycn](https://github.com/reycn))* - [Nov. 20 2024] GUI now supports specifying Ollama and OpenAI models *(by [@IuvenisSapiens](https://github.com/IuvenisSapiens), [@Byaidu](https://github.com/Byaidu))* - [Nov. 20 2024] 🌟 [Demo](#demo) online! *(by [@reycn](https://github.com/reycn))* - [Nov. 20 2024] Supports [Docker](#docker) *(by [@Byaidu](https://github.com/Byaidu))* - [Nov. 20 2024] Supports [multiple-threads translation](#threads) *(by [@Byaidu](https://github.com/Byaidu))* -- [Nov. 19 2024] Provides an [interactive graphical user interface](#gui) *(by [@reycn](https://github.com/reycn))* -- [Nov. 18 2024] Supports [more services: DeepL, DeepLX, and Azure](#services) *(by [@reycn](https://github.com/reycn), [@Hanaasagi](https://github.com/Hanaasagi))*

Preview

diff --git a/README_zh-CN.md b/README_zh-CN.md index a5c7b866..cde585a1 100644 --- a/README_zh-CN.md +++ b/README_zh-CN.md @@ -37,13 +37,16 @@

近期更新

+ +- [Nov. 23 2024] 防止网页爬虫的防火墙 *(by [@Byaidu](https://github.com/Byaidu))* +- [Nov. 22 2024] 图形用户界面现已支持意大利语,并获得了一些更新 *(by [@Byaidu](https://github.com/Byaidu), [@reycn](https://github.com/reycn))* +- [Nov. 22 2024] 现在你可以将自己部署的服务分享给朋友了 *(by [@Zxis233](https://github.com/Zxis233))* +- [Nov. 22 2024] Now supports Tencent Translation *(by [@hellofinch](https://github.com/hellofinch))* - [Nov. 21 2024] 图形用户界面现在支持下载双语文档 *(by [@reycn](https://github.com/reycn))* - [Nov. 20 2024] 图形用户界面现在支持指定 Ollama 和 OpenAI 的模型 *(by [@IuvenisSapiens](https://github.com/IuvenisSapiens), [@Byaidu](https://github.com/Byaidu))* - [Nov. 20 2024] 🌟 提供了 [在线演示](#demo)! *(by [@reycn](https://github.com/reycn))* - [Nov. 20 2024] 支持 [容器化部署](#docker) *(by [@Byaidu](https://github.com/Byaidu))* -- [Nov. 20 2024] 支持速度更快的 [多线程翻译](#threads) *(by [@Byaidu](https://github.com/Byaidu))* -- [Nov. 19 2024] 提供了[图形用户界面](#gui) *(by [@reycn](https://github.com/reycn))* -- [Nov. 18 2024] 支持更多翻译服务,包含 [DeepL, DeepLX, 和 Azure](#services) *(by [@reycn](https://github.com/reycn), [@Hanaasagi](https://github.com/Hanaasagi))* +- [Nov. 20 2024] 支持速度更快的 [多线程翻译](#threads) *(by [@Byaidu](https://github.com/Byaidu))*

效果预览

diff --git a/pdf2zh/cache.py b/pdf2zh/cache.py index 3e45b41c..275cd712 100644 --- a/pdf2zh/cache.py +++ b/pdf2zh/cache.py @@ -3,9 +3,10 @@ import time import hashlib import shutil -cache_dir = os.path.join(tempfile.gettempdir(), 'cache') + +cache_dir = os.path.join(tempfile.gettempdir(), "cache") os.makedirs(cache_dir, exist_ok=True) -time_filename = 'update_time' +time_filename = "update_time" max_cache = 5 @@ -16,25 +17,30 @@ def deterministic_hash(obj): def get_dirs(): - dirs = [os.path.join(cache_dir, dir) for dir in os.listdir(cache_dir) if os.path.isdir(os.path.join(cache_dir, dir))] + dirs = [ + os.path.join(cache_dir, dir) + for dir in os.listdir(cache_dir) + if os.path.isdir(os.path.join(cache_dir, dir)) + ] return dirs def get_time(dir): try: timefile = os.path.join(dir, time_filename) - t = float(open(timefile, encoding='utf-8').read()) + t = float(open(timefile, encoding="utf-8").read()) return t except FileNotFoundError: # handle the error as needed, for now we'll just return a default value - return float('inf') # This ensures that this directory will be the first to be removed if required - + return float( + "inf" + ) # This ensures that this directory will be the first to be removed if required def write_time(dir): timefile = os.path.join(dir, time_filename) t = time.time() - print(t, file=open(timefile, "w", encoding='utf-8'), end='') + print(t, file=open(timefile, "w", encoding="utf-8"), end="") def argmin(iterable): @@ -44,7 +50,9 @@ def argmin(iterable): def remove_extra(): dirs = get_dirs() for dir in dirs: - if not os.path.isdir(dir): # This line might be redundant now, as get_dirs() ensures only directories are returned + if not os.path.isdir( + dir + ): # This line might be redundant now, as get_dirs() ensures only directories are returned os.remove(dir) try: get_time(dir) @@ -73,11 +81,11 @@ def create_cache(hash_key): def load_paragraph(hash_key, hash_key_paragraph): filename = os.path.join(cache_dir, hash_key, hash_key_paragraph) if 
os.path.exists(filename): - return open(filename, encoding='utf-8').read() + return open(filename, encoding="utf-8").read() else: return None def write_paragraph(hash_key, hash_key_paragraph, paragraph): filename = os.path.join(cache_dir, hash_key, hash_key_paragraph) - print(paragraph, file=open(filename, "w", encoding='utf-8'), end='') \ No newline at end of file + print(paragraph, file=open(filename, "w", encoding="utf-8"), end="") diff --git a/pdf2zh/converter.py b/pdf2zh/converter.py index 587159c3..70594ffc 100644 --- a/pdf2zh/converter.py +++ b/pdf2zh/converter.py @@ -1,3 +1,45 @@ +from pdf2zh.utils import ( + AnyIO, + Matrix, + PathSegment, + Point, + Rect, + apply_matrix_pt, + bbox2str, + enc, + make_compat_str, + mult_matrix, + matrix_scale, +) +from pdf2zh.pdftypes import PDFStream +from pdf2zh.pdfpage import PDFPage +from pdf2zh.pdfinterp import PDFGraphicState, PDFResourceManager +from pdf2zh.pdffont import PDFFont, PDFUnicodeNotDefined, PDFCIDFont +from pdf2zh.pdfexceptions import PDFValueError +from pdf2zh.pdfdevice import PDFTextDevice +from pdf2zh.pdfcolor import PDFColorSpace +from pdf2zh.layout import ( + LAParams, + LTAnno, + LTChar, + LTComponent, + LTCurve, + LTFigure, + LTImage, + LTItem, + LTLayoutContainer, + LTLine, + LTPage, + LTRect, + LTText, + LTTextBox, + LTTextBoxVertical, + LTTextGroup, + LTTextLine, + TextGroupElement, +) +from pdf2zh.image import ImageWriter +from pdf2zh import utils import io import logging import re @@ -28,55 +70,15 @@ OpenAITranslator, AzureTranslator, ) + + def remove_control_characters(s): - return "".join(ch for ch in s if unicodedata.category(ch)[0]!="C") + return "".join(ch for ch in s if unicodedata.category(ch)[0] != "C") -from pdf2zh import utils -from pdf2zh.image import ImageWriter -from pdf2zh.layout import ( - LAParams, - LTAnno, - LTChar, - LTComponent, - LTContainer, - LTCurve, - LTFigure, - LTImage, - LTItem, - LTLayoutContainer, - LTLine, - LTPage, - LTRect, - LTText, - LTTextBox, - 
LTTextBoxVertical, - LTTextGroup, - LTTextLine, - TextGroupElement, -) -from pdf2zh.pdfcolor import PDFColorSpace -from pdf2zh.pdfdevice import PDFTextDevice -from pdf2zh.pdfexceptions import PDFValueError -from pdf2zh.pdffont import PDFFont, PDFUnicodeNotDefined, PDFCIDFont -from pdf2zh.pdfinterp import PDFGraphicState, PDFResourceManager -from pdf2zh.pdfpage import PDFPage -from pdf2zh.pdftypes import PDFStream -from pdf2zh.utils import ( - AnyIO, - Matrix, - PathSegment, - Point, - Rect, - apply_matrix_pt, - bbox2str, - enc, - make_compat_str, - mult_matrix, - matrix_scale, -) log = logging.getLogger(__name__) + class PDFLayoutAnalyzer(PDFTextDevice): cur_item: LTLayoutContainer ctm: Matrix @@ -188,7 +190,7 @@ def paint_path( # Note: 'ml', in conditional above, is a frequent anomaly # that we want to support. line = LTLine( - gstate.linewidth*matrix_scale(self.ctm), + gstate.linewidth * matrix_scale(self.ctm), pts[0], pts[1], stroke, @@ -210,7 +212,7 @@ def paint_path( ) or (y0 == y1 and x1 == x2 and y2 == y3 and x3 == x0) if is_closed_loop and has_square_coordinates: rect = LTRect( - gstate.linewidth*matrix_scale(self.ctm), + gstate.linewidth * matrix_scale(self.ctm), (*pts[0], *pts[2]), stroke, fill, @@ -223,7 +225,7 @@ def paint_path( self.cur_item.add(rect) else: curve = LTCurve( - gstate.linewidth*matrix_scale(self.ctm), + gstate.linewidth * matrix_scale(self.ctm), pts, stroke, fill, @@ -236,7 +238,7 @@ def paint_path( self.cur_item.add(curve) else: curve = LTCurve( - gstate.linewidth*matrix_scale(self.ctm), + gstate.linewidth * matrix_scale(self.ctm), pts, stroke, fill, @@ -279,7 +281,7 @@ def render_char( graphicstate, ) self.cur_item.add(item) - item.cid=cid # hack + item.cid = cid # hack return item.adv def handle_undefined_char(self, font: PDFFont, cid: int) -> str: @@ -355,7 +357,7 @@ def __init__( vfont: str = None, vchar: str = None, thread: int = 0, - layout = {}, + layout={}, lang_in: str = "", lang_out: str = "", service: str = "", @@ -367,7 
+369,7 @@ def __init__( self.vchar = vchar self.thread = thread self.layout = layout - param=service.split(':',1) + param = service.split(":", 1) if param[0] == "google": self.translator: BaseTranslator = GoogleTranslator( service, lang_out, lang_in, None @@ -384,11 +386,11 @@ def __init__( self.translator: BaseTranslator = OllamaTranslator( service, lang_out, lang_in, param[1] ) - elif param[0] == 'openai': + elif param[0] == "openai": self.translator: BaseTranslator = OpenAITranslator( service, lang_out, lang_in, param[1] ) - elif param[0] == 'azure': + elif param[0] == "azure": self.translator: BaseTranslator = AzureTranslator( service, lang_out, lang_in, None ) @@ -404,173 +406,255 @@ def write_text(self, text: str) -> None: def receive_layout(self, ltpage: LTPage): def render(item: LTItem) -> None: - xt=None # 上一个字符 - sstk=[] # 段落文字栈 - vstk=[] # 公式符号组 - vlstk=[] # 公式线条组 - vfix=0 # 公式纵向偏移 - vbkt=0 # 段落公式括号计数 - pstk=[] # 段落属性栈 - lstk=[] # 全局线条栈 - var=[] # 公式符号组栈 - varl=[] # 公式线条组栈 - varf=[] # 公式纵向偏移栈 - vlen=[] # 公式宽度栈 - xt_cls=-1 # 上一个字符所属段落 - vmax=ltpage.width/4 # 行内公式最大宽度 - ops="" # 渲染结果 - def vflag(font,char): # 匹配公式(和角标)字体 - if re.match(r'\(cid:',char): + xt = None # 上一个字符 + sstk = [] # 段落文字栈 + vstk = [] # 公式符号组 + vlstk = [] # 公式线条组 + vfix = 0 # 公式纵向偏移 + vbkt = 0 # 段落公式括号计数 + pstk = [] # 段落属性栈 + lstk = [] # 全局线条栈 + var = [] # 公式符号组栈 + varl = [] # 公式线条组栈 + varf = [] # 公式纵向偏移栈 + vlen = [] # 公式宽度栈 + xt_cls = -1 # 上一个字符所属段落 + vmax = ltpage.width / 4 # 行内公式最大宽度 + ops = "" # 渲染结果 + + def vflag(font, char): # 匹配公式(和角标)字体 + if re.match(r"\(cid:", char): return True if self.vfont: - if re.match(self.vfont,font): + if re.match(self.vfont, font): return True else: - if re.match(r'(CM[^R]|MS|XY|MT|BL|RM|EU|LA|RS|LINE|TeX-|rsfs|txsy|wasy|.*Mono|.*Code|.*Ital|.*Sym)',font): + if re.match( + r"(CM[^R]|MS|XY|MT|BL|RM|EU|LA|RS|LINE|TeX-|rsfs|txsy|wasy|.*Mono|.*Code|.*Ital|.*Sym)", + font, + ): return True if self.vchar: - if re.match(self.vchar,char): + if 
re.match(self.vchar, char): return True else: - if char and char!=' ' and (unicodedata.category(char[0]) in ['Lm','Mn','Sk','Sm','Zl','Zp','Zs'] or ord(char[0]) in range(0x370,0x400)): # 文字修饰符、数学符号、分隔符号、希腊字母 + if ( + char + and char != " " + and ( + unicodedata.category(char[0]) + in ["Lm", "Mn", "Sk", "Sm", "Zl", "Zp", "Zs"] + or ord(char[0]) in range(0x370, 0x400) + ) + ): # 文字修饰符、数学符号、分隔符号、希腊字母 return True return False - ptr=0 - item=list(item) - while ptrvmax and cls!=0): # 公式结束、段落边界、公式换行 - if vstk: # 公式出栈 - sstk[-1]+=f'$v{len(var)}$' - if not cur_v and cls==xt_cls and child.x0>max([vch.x0 for vch in vstk]): # and child.y1>vstk[0].y0: # 段落内公式转文字,行内公式修正 - vfix=vstk[0].y0-child.y0 + # ops+=f'ET [] 0 d 0 J 0.1 w {child.x0:f} + # {child.y0:f} {child.x1-child.x0:f} {child.y1-child.y0:f} re S Q BT ' + if ( + cls == 0 + or (cls == xt_cls and child.size < pstk[-1][4] * 0.79) + or vflag(fontname, child.get_text()) + or (child.matrix[0] == 0 and child.matrix[3] == 0) + ): # 有 0.76 的角标和 0.799 的大写,这里用 0.79 取中 + cur_v = True + if not cur_v: # 判定括号组是否属于公式 + if vstk and child.get_text() == "(": + cur_v = True + vbkt += 1 + if vbkt and child.get_text() == ")": + cur_v = True + vbkt -= 1 + if ( + not cur_v + or cls != xt_cls + or (abs(child.x0 - xt.x0) > vmax and cls != 0) + ): # 公式结束、段落边界、公式换行 + if vstk: # 公式出栈 + sstk[-1] += f"$v{len(var)}$" + if ( + not cur_v + and cls == xt_cls + and child.x0 > max([vch.x0 for vch in vstk]) + ): # and child.y1>vstk[0].y0: # 段落内公式转文字,行内公式修正 + vfix = vstk[0].y0 - child.y0 var.append(vstk) varl.append(vlstk) varf.append(vfix) - vstk=[] - vlstk=[] - vfix=0 - if not vstk: # 非公式或是公式开头 - if cls==xt_cls: # 同一段落 - if child.x0 > xt.x1 + 1: # 行内空格 - sstk[-1]+=' ' - elif child.x1 < xt.x0: # 换行空格 - sstk[-1]+=' ' - pstk[-1][6]=True # 标记原文段落存在换行 + vstk = [] + vlstk = [] + vfix = 0 + if not vstk: # 非公式或是公式开头 + if cls == xt_cls: # 同一段落 + if child.x0 > xt.x1 + 1: # 行内空格 + sstk[-1] += " " + elif child.x1 < xt.x0: # 换行空格 + sstk[-1] += " " + pstk[-1][6] = True # 
标记原文段落存在换行 else: sstk.append("") - pstk.append([child.y0,child.x0,child.x0,child.x0,child.size,child.font,False]) - if not cur_v: # 文字入栈 - if child.size>pstk[-1][4]/0.79 or vflag(pstk[-1][5].fontname.split('+')[-1],'') or re.match(r'(.*Medi|.*Bold)',pstk[-1][5].fontname.split('+')[-1],re.IGNORECASE): # 小字体、公式或粗体开头,后续接文字,需要校正字体 - pstk[-1][0]-=child.size-pstk[-1][4] - pstk[-1][4]=child.size - pstk[-1][5]=child.font - sstk[-1]+=child.get_text() - else: # 公式入栈 - if not vstk and cls==xt_cls and child.x0>xt.x0: # and child.y1>xt.y0: # 段落内文字转公式,行内公式修正 - vfix=child.y0-xt.y0 + pstk.append( + [ + child.y0, + child.x0, + child.x0, + child.x0, + child.size, + child.font, + False, + ] + ) + if not cur_v: # 文字入栈 + if ( + child.size > pstk[-1][4] / 0.79 + or vflag(pstk[-1][5].fontname.split("+")[-1], "") + or re.match( + r"(.*Medi|.*Bold)", + pstk[-1][5].fontname.split("+")[-1], + re.IGNORECASE, + ) + ): # 小字体、公式或粗体开头,后续接文字,需要校正字体 + pstk[-1][0] -= child.size - pstk[-1][4] + pstk[-1][4] = child.size + pstk[-1][5] = child.font + sstk[-1] += child.get_text() + else: # 公式入栈 + if ( + not vstk and cls == xt_cls and child.x0 > xt.x0 + ): # and child.y1>xt.y0: # 段落内文字转公式,行内公式修正 + vfix = child.y0 - xt.y0 vstk.append(child) # 更新段落边界,段落内换行之后可能是公式开头 - pstk[-1][2]=min(pstk[-1][2],child.x0) - pstk[-1][3]=max(pstk[-1][3],child.x1) - xt=child - xt_cls=cls - elif isinstance(child, LTFigure): # 图表 + pstk[-1][2] = min(pstk[-1][2], child.x0) + pstk[-1][3] = max(pstk[-1][3], child.x1) + xt = child + xt_cls = cls + elif isinstance(child, LTFigure): # 图表 pass - elif isinstance(child, LTLine): # 线条 - layout=self.layout[ltpage.pageid] - h,w=layout.shape # ltpage.height 可能是 fig 里面的高度,这里统一用 layout.shape - cx,cy=np.clip(int(child.x0),0,w-1),np.clip(int(child.y0),0,h-1) - cls=layout[cy,cx] - if vstk and cls==xt_cls: # 公式线条 + elif isinstance(child, LTLine): # 线条 + layout = self.layout[ltpage.pageid] + h, w = ( + layout.shape + ) # ltpage.height 可能是 fig 里面的高度,这里统一用 layout.shape + cx, cy = 
np.clip(int(child.x0), 0, w - 1), np.clip( + int(child.y0), 0, h - 1 + ) + cls = layout[cy, cx] + if vstk and cls == xt_cls: # 公式线条 vlstk.append(child) - else: # 全局线条 + else: # 全局线条 lstk.append(child) else: # print(child) pass - ptr+=1 + ptr += 1 # 处理结尾 - if vstk: # 公式出栈 - sstk[-1]+=f'$v{len(var)}$' + if vstk: # 公式出栈 + sstk[-1] += f"$v{len(var)}$" var.append(vstk) varl.append(vlstk) varf.append(vfix) - log.debug('\n==========[VSTACK]==========\n') - for id,v in enumerate(var): # 计算公式宽度 - l=max([vch.x1 for vch in v])-v[0].x0 - log.debug(f'< {l:.1f} {v[0].x0:.1f} {v[0].y0:.1f} {v[0].cid} {v[0].fontname} {len(varl[id])} > $v{id}$ = {"".join([ch.get_text() for ch in v])}') + log.debug("\n==========[VSTACK]==========\n") + for id, v in enumerate(var): # 计算公式宽度 + l = max([vch.x1 for vch in v]) - v[0].x0 # noqa: E741 + log.debug( + f'< {l:.1f} {v[0].x0:.1f} {v[0].y0:.1f} {v[0].cid} {v[0].fontname} {len(varl[id])} > $v{id}$ = {"".join([ch.get_text() for ch in v])}' # noqa: E501 + ) vlen.append(l) - log.debug('\n==========[SSTACK]==========\n') - hash_key=cache.deterministic_hash("PDFMathTranslate") + log.debug("\n==========[SSTACK]==========\n") + hash_key = cache.deterministic_hash("PDFMathTranslate") cache.create_cache(hash_key) + @retry(wait=wait_fixed(1)) - def worker(s): # 多线程翻译 + def worker(s): # 多线程翻译 try: - hash_key_paragraph = cache.deterministic_hash((s,str(self.translator))) - new = cache.load_paragraph(hash_key, hash_key_paragraph) # 查询缓存 + hash_key_paragraph = cache.deterministic_hash( + (s, str(self.translator)) + ) + new = cache.load_paragraph(hash_key, hash_key_paragraph) # 查询缓存 if new is None: - new=self.translator.translate(s) - new=remove_control_characters(new) + new = self.translator.translate(s) + new = remove_control_characters(new) cache.write_paragraph(hash_key, hash_key_paragraph, new) return new except BaseException as e: if log.isEnabledFor(logging.DEBUG): log.exception(e) else: - log.exception(e,exc_info=False) + log.exception(e, 
exc_info=False) raise e - with concurrent.futures.ThreadPoolExecutor(max_workers=self.thread) as executor: + + with concurrent.futures.ThreadPoolExecutor( + max_workers=self.thread + ) as executor: news = list(executor.map(worker, sstk)) - def raw_string(fcur,cstk): # 编码字符串 - if isinstance(self.fontmap[fcur],PDFCIDFont): # 判断编码长度 + + def raw_string(fcur, cstk): # 编码字符串 + if isinstance(self.fontmap[fcur], PDFCIDFont): # 判断编码长度 return "".join(["%04x" % ord(c) for c in cstk]) else: return "".join(["%02x" % ord(c) for c in cstk]) - _x,_y=0,0 - for id,new in enumerate(news): # 排版文字和公式 - tx=x=pstk[id][1];y=pstk[id][0];lt=pstk[id][2];rt=pstk[id][3];ptr=0;size=pstk[id][4];font=pstk[id][5];lb=pstk[id][6] # 段落属性 - cstk='' # 单行文字栈 - fcur=fcur_=None # 单行字体 - log.debug(f"< {y} {x} {lt} {rt} {size} {font.fontname} {lb} > {sstk[id]} | {new}") + + _x, _y = 0, 0 + for id, new in enumerate(news): # 排版文字和公式 + tx = x = pstk[id][1] + y = pstk[id][0] + lt = pstk[id][2] + rt = pstk[id][3] + ptr = 0 + size = pstk[id][4] + font = pstk[id][5] + lb = pstk[id][6] # 段落属性 + cstk = "" # 单行文字栈 + fcur = fcur_ = None # 单行字体 + log.debug( + f"< {y} {x} {lt} {rt} {size} {font.fontname} {lb} > {sstk[id]} | {new}" + ) while True: - if ptr==len(new): # 到达段落结尾 + if ptr == len(new): # 到达段落结尾 if cstk: - ops+=f'/{fcur} {size:f} Tf 1 0 0 1 {tx:f} {y:f} Tm [<{raw_string(fcur,cstk)}>] TJ ' + ops += f"/{fcur} {size:f} Tf 1 0 0 1 {tx:f} {y:f} Tm [<{raw_string(fcur, cstk)}>] TJ " break - vy_regex=re.match(r'\$?\s*v([\d\s]+)\$',new[ptr:],re.IGNORECASE) # 匹配 $vn$ 公式标记,前面的 $ 有的时候会被丢掉 - mod=False # 当前公式是否为文字修饰符 - if vy_regex: # 加载公式 - ptr+=len(vy_regex.group(0)) + vy_regex = re.match( + r"\$?\s*v([\d\s]+)\$", new[ptr:], re.IGNORECASE + ) # 匹配 $vn$ 公式标记,前面的 $ 有的时候会被丢掉 + mod = False # 当前公式是否为文字修饰符 + if vy_regex: # 加载公式 + ptr += len(vy_regex.group(0)) try: - vid=int(vy_regex.group(1).replace(' ','')) - adv=vlen[vid] - except: - continue # 翻译器可能会自动补个越界的公式标记 - if len(var[vid])==1 and 
unicodedata.category(var[vid][0].get_text()[0]) in ['Lm','Mn','Sk']: # 文字修饰符 - mod=True - else: # 加载文字 - ch=new[ptr] + vid = int(vy_regex.group(1).replace(" ", "")) + adv = vlen[vid] + except Exception: + continue # 翻译器可能会自动补个越界的公式标记 + if len(var[vid]) == 1 and unicodedata.category( + var[vid][0].get_text()[0] + ) in [ + "Lm", + "Mn", + "Sk", + ]: # 文字修饰符 + mod = True + else: # 加载文字 + ch = new[ptr] # if font.char_width(ord(ch)): - fcur_=None + fcur_ = None # 原字体编码容易出问题,这里直接放弃掉 # try: # if font.widths.get(ord(ch)) and font.to_unichr(ord(ch))==ch: @@ -578,58 +662,84 @@ def raw_string(fcur,cstk): # 编码字符串 # except: # pass try: - if fcur_==None and self.fontmap['tiro'].to_unichr(ord(ch))==ch: - fcur_='tiro' # 默认英文字体 - except: + if ( + fcur_ is None + and self.fontmap["tiro"].to_unichr(ord(ch)) == ch + ): + fcur_ = "tiro" # 默认英文字体 + except Exception: pass - if fcur_==None: - fcur_='china-ss' # 默认中文字体 + if fcur_ is None: + fcur_ = "china-ss" # 默认中文字体 # print(self.fontid[font],fcur_,ch,font.char_width(ord(ch))) - adv=self.fontmap[fcur_].char_width(ord(ch))*size - ptr+=1 - if fcur_!=fcur or vy_regex or x+adv>rt+0.1*size: # 输出文字缓冲区:1.字体更新 2.插入公式 3.到达右边界(可能一整行都被符号化,这里需要考虑浮点误差) + adv = self.fontmap[fcur_].char_width(ord(ch)) * size + ptr += 1 + if ( + fcur_ != fcur or vy_regex or x + adv > rt + 0.1 * size + ): # 输出文字缓冲区:1.字体更新 2.插入公式 3.到达右边界(可能一整行都被符号化,这里需要考虑浮点误差) if cstk: - ops+=f'/{fcur} {size:f} Tf 1 0 0 1 {tx:f} {y:f} Tm [<{raw_string(fcur,cstk)}>] TJ ' - cstk='' - if lb and x+adv>rt+0.1*size: # 到达右边界且原文段落存在换行 - x=lt - lang_space={'zh-CN':1.4,'zh-TW':1.4,'ja':1.1,'ko':1.2,'en':1.2} # CJK - y-=size*lang_space.get(self.translator.lang_out,1.1) # 小语种大多适配 1.1 - if vy_regex: # 插入公式 - fix=0 - if fcur!=None: # 段落内公式修正纵向偏移 - fix=varf[vid] - for vch in var[vid]: # 排版公式字符 - vc=chr(vch.cid) - ops+=f"/{self.fontid[vch.font]} {vch.size:f} Tf 1 0 0 1 {x+vch.x0-var[vid][0].x0:f} {fix+y+vch.y0-var[vid][0].y0:f} Tm [<{raw_string(self.fontid[vch.font],vc)}>] TJ " + ops += f"/{fcur} {size:f} 
Tf 1 0 0 1 {tx:f} {y:f} Tm [<{raw_string(fcur, cstk)}>] TJ " + cstk = "" + if lb and x + adv > rt + 0.1 * size: # 到达右边界且原文段落存在换行 + x = lt + lang_space = { + "zh-CN": 1.4, + "zh-TW": 1.4, + "ja": 1.1, + "ko": 1.2, + "en": 1.2, + } # CJK + y -= size * lang_space.get( + self.translator.lang_out, 1.1 + ) # 小语种大多适配 1.1 + if vy_regex: # 插入公式 + fix = 0 + if fcur is not None: # 段落内公式修正纵向偏移 + fix = varf[vid] + for vch in var[vid]: # 排版公式字符 + vc = chr(vch.cid) + ops += f"/{self.fontid[vch.font]} {vch.size:f} Tf 1 0 0 1 {x + vch.x0 - var[vid][0].x0:f} {fix + y + vch.y0 - var[vid][0].y0:f} Tm [<{raw_string(self.fontid[vch.font], vc)}>] TJ " # noqa: E501 if log.isEnabledFor(logging.DEBUG): - lstk.append(LTLine(0.1,(_x,_y),(x+vch.x0-var[vid][0].x0,fix+y+vch.y0-var[vid][0].y0))) - _x,_y=x+vch.x0-var[vid][0].x0,fix+y+vch.y0-var[vid][0].y0 - for l in varl[vid]: # 排版公式线条 - if l.linewidth<5: # hack - ops+=f"ET q 1 0 0 1 {l.pts[0][0]+x-var[vid][0].x0:f} {l.pts[0][1]+fix+y-var[vid][0].y0:f} cm [] 0 d 0 J {l.linewidth:f} w 0 0 m {l.pts[1][0]-l.pts[0][0]:f} {l.pts[1][1]-l.pts[0][1]:f} l S Q BT " - else: # 插入文字缓冲区 - if not cstk: # 单行开头 - tx=x - if x==lt and ch==' ': # 消除段落换行空格 - adv=0 + lstk.append( + LTLine( + 0.1, + (_x, _y), + ( + x + vch.x0 - var[vid][0].x0, + fix + y + vch.y0 - var[vid][0].y0, + ), + ) + ) + _x, _y = ( + x + vch.x0 - var[vid][0].x0, + fix + y + vch.y0 - var[vid][0].y0, + ) + for l in varl[vid]: # 排版公式线条 # noqa: E741 + if l.linewidth < 5: # hack + ops += f"ET q 1 0 0 1 {l.pts[0][0] + x - var[vid][0].x0:f} {l.pts[0][1] + fix + y - var[vid][0].y0:f} cm [] 0 d 0 J {l.linewidth:f} w 0 0 m {l.pts[1][0] - l.pts[0][0]:f} {l.pts[1][1] - l.pts[0][1]:f} l S Q BT " # noqa: E501 + else: # 插入文字缓冲区 + if not cstk: # 单行开头 + tx = x + if x == lt and ch == " ": # 消除段落换行空格 + adv = 0 else: - cstk+=ch + cstk += ch else: - cstk+=ch - if mod: # 文字修饰符 - adv=0 - fcur=fcur_ - x+=adv + cstk += ch + if mod: # 文字修饰符 + adv = 0 + fcur = fcur_ + x += adv if log.isEnabledFor(logging.DEBUG): - 
lstk.append(LTLine(0.1,(_x,_y),(x,y))) - _x,_y=x,y - for l in lstk: # 排版全局线条 - if l.linewidth<5: # hack - ops+=f"ET q 1 0 0 1 {l.pts[0][0]:f} {l.pts[0][1]:f} cm [] 0 d 0 J {l.linewidth:f} w 0 0 m {l.pts[1][0]-l.pts[0][0]:f} {l.pts[1][1]-l.pts[0][1]:f} l S Q BT " - ops=f'BT {ops}ET ' + lstk.append(LTLine(0.1, (_x, _y), (x, y))) + _x, _y = x, y + for l in lstk: # 排版全局线条 # noqa: E741 + if l.linewidth < 5: # hack + ops += f"ET q 1 0 0 1 {l.pts[0][0]:f} {l.pts[0][1]:f} cm [] 0 d 0 J {l.linewidth:f} w 0 0 m {l.pts[1][0] - l.pts[0][0]:f} {l.pts[1][1] - l.pts[0][1]:f} l S Q BT " # noqa: E501 + ops = f"BT {ops}ET " return ops - ops=render(ltpage) + + ops = render(ltpage) return ops # Some dummy functions to save memory/CPU when all that is wanted diff --git a/pdf2zh/encodingdb.py b/pdf2zh/encodingdb.py index 877ea892..ee6a1061 100644 --- a/pdf2zh/encodingdb.py +++ b/pdf2zh/encodingdb.py @@ -120,7 +120,7 @@ def get_encoding( elif isinstance(x, PSLiteral): try: cid2unicode[cid] = name2unicode(cast(str, x.name)) - except (KeyError, ValueError) as e: + except (KeyError, ValueError): # log.debug(str(e)) pass cid += 1 diff --git a/pdf2zh/fontmetrics.py b/pdf2zh/fontmetrics.py index b6780b96..c95c1c1d 100644 --- a/pdf2zh/fontmetrics.py +++ b/pdf2zh/fontmetrics.py @@ -9,7 +9,7 @@ """ -### BEGIN Verbatim copy of the license part +# BEGIN Verbatim copy of the license part # # Adobe Core 35 AFM Files with 314 Glyph Entries - ReadMe @@ -24,7 +24,7 @@ # obligation to support the use of the AFM files. 
# -### END Verbatim copy of the license part +# END Verbatim copy of the license part # flake8: noqa from typing import Dict diff --git a/pdf2zh/gui.py b/pdf2zh/gui.py index b35117db..8b5b4a17 100644 --- a/pdf2zh/gui.py +++ b/pdf2zh/gui.py @@ -33,32 +33,32 @@ page_map = { "All": None, "First": [0], - "First 5 pages": list(range(0,5)), + "First 5 pages": list(range(0, 5)), } -flag_demo=False -if os.environ.get('PDF2ZH_DEMO'): - flag_demo=True +flag_demo = False +if os.environ.get("PDF2ZH_DEMO"): + flag_demo = True service_map = { "Google": "google", } page_map = { "First": [0], - "First 20 pages": list(range(0,20)), + "First 20 pages": list(range(0, 20)), } - client_key=os.environ.get('PDF2ZH_CLIENT_KEY') - server_key=os.environ.get('PDF2ZH_SERVER_KEY') + client_key = os.environ.get("PDF2ZH_CLIENT_KEY") + server_key = os.environ.get("PDF2ZH_SERVER_KEY") def verify_recaptcha(response): recaptcha_url = "https://www.google.com/recaptcha/api/siteverify" - print('reCAPTCHA',server_key,response) + print("reCAPTCHA", server_key, response) data = {"secret": server_key, "response": response} result = requests.post(recaptcha_url, data=data).json() - print('reCAPTCHA',result.get("success")) + print("reCAPTCHA", result.get("success")) return result.get("success") @@ -87,14 +87,20 @@ def upload_file(file, service, progress=gr.Progress()): def translate( - file_path, service, model_id, lang, page_range, recaptcha_response, progress=gr.Progress() + file_path, + service, + model_id, + lang, + page_range, + recaptcha_response, + progress=gr.Progress(), ): """Translate PDF content using selected service.""" if not file_path: - raise gr.Error('No input') + raise gr.Error("No input") if flag_demo and not verify_recaptcha(recaptcha_response): - raise gr.Error('reCAPTCHA fail') + raise gr.Error("reCAPTCHA fail") progress(0, desc="Starting translation...") @@ -113,30 +119,31 @@ def translate( lang_to = "zh-CN" if lang_to == "zh" else lang_to print(f"Files before translation: 
{os.listdir(output)}") - def progress_bar(t:tqdm.tqdm): - progress(t.n/t.total, desc="Translating...") - - param={ - 'files':[file_en], - 'pages':selected_page, - 'lang_in':'auto', - 'lang_out':lang_to, - 'service':f"{selected_service}:{model_id}", - 'output':output, - 'thread':4, - 'callback':progress_bar, - } + + def progress_bar(t: tqdm.tqdm): + progress(t.n / t.total, desc="Translating...") + + param = { + "files": [file_en], + "pages": selected_page, + "lang_in": "auto", + "lang_out": lang_to, + "service": f"{selected_service}:{model_id}", + "output": output, + "thread": 4, + "callback": progress_bar, + } print(param) extract_text(**param) print(f"Files after translation: {os.listdir(output)}") if not file_zh.exists() or not file_dual.exists(): - raise gr.Error('No output') + raise gr.Error("No output") try: translated_preview = pdf_preview(str(file_zh)) - except Exception as e: - raise gr.Error('No preview') + except Exception: + raise gr.Error("No preview") progress(1.0, desc="Translation complete!") @@ -175,7 +182,7 @@ def progress_bar(t:tqdm.tqdm): footer {visibility: hidden} .env-warning {color: #dd5500 !important;} .env-success {color: #559900 !important;} - + @keyframes pulse-background { 0% { background-color: #FFFFFF; } 25% { background-color: #FFFFFF; } @@ -183,7 +190,7 @@ def progress_bar(t:tqdm.tqdm): 75% { background-color: #FFFFFF; } 100% { background-color: #FFFFFF; } } - + /* Add dashed border to input-file class */ .input-file { border: 1.2px dashed #165DFF !important; @@ -232,7 +239,9 @@ def progress_bar(t:tqdm.tqdm): ''' if flag_demo else "" ) as demo: - gr.Markdown("# [PDFMathTranslate @ Github](https://github.com/Byaidu/PDFMathTranslate)") + gr.Markdown( + "# [PDFMathTranslate @ Github](https://github.com/Byaidu/PDFMathTranslate)" + ) with gr.Row(): with gr.Column(scale=1): @@ -265,14 +274,15 @@ def progress_bar(t:tqdm.tqdm): ) model_id = gr.Textbox( label="Model ID", - info="Please enter the identifier of the model you wish to use (e.g., 
gemma2). This identifier will be used to specify the particular model for translation.", + info="Please enter the identifier of the model you wish to use (e.g., gemma2). " + "This identifier will be used to specify the particular model for translation.", # value="gemma2", visible=False, # hide by default ) envs_status = "- Properly configured.
" def details_wrapper(text_markdown): - text = f""" + text = f"""
Technical details {text_markdown} @@ -287,7 +297,11 @@ def env_var_checker(env_var_name: str) -> str: not os.environ.get(env_var_name) or os.environ.get(env_var_name) == "" ): - envs_status = f"- Warning: environmental not found or error ({env_var_name}).
- Please make sure that the environment variables are properly configured (guide).
" + envs_status = ( + f"- Warning: environmental not found or error ({env_var_name})." + + "
- Please make sure that the environment variables are properly configured " + + "(guide).
" + ) else: value = str(os.environ.get(env_var_name)) envs_status = ( @@ -327,7 +341,11 @@ def on_select_service(value, evt: gr.EventData): ) # show model id when service is selected envs_status = env_var_checker("OLLAMA_HOST") else: - envs_status = "- Warning: model not in the list.
- Please report via (guide).
" + envs_status = ( + "- Warning: model not in the list." + "
- Please report via " + "(guide).
" + ) return envs_status, model_visibility output_title = gr.Markdown("## Translated", visible=False) @@ -381,20 +399,28 @@ def on_select_service(value, evt: gr.EventData): def setup_gui(share=False): - import doclayout_yolo # cache + import doclayout_yolo # cache # noqa: F401 + if flag_demo: - demo.launch(server_name="0.0.0.0", max_file_size='5mb', inbrowser=True) + demo.launch(server_name="0.0.0.0", max_file_size="5mb", inbrowser=True) else: try: demo.launch(server_name="0.0.0.0", debug=True, inbrowser=True, share=share) except Exception: - print("Error launching GUI using 0.0.0.0.\nThis may be caused by global mode of proxy software.") + print( + "Error launching GUI using 0.0.0.0.\nThis may be caused by global mode of proxy software." + ) try: - demo.launch(server_name="127.0.0.1", debug=True, inbrowser=True, share=share) + demo.launch( + server_name="127.0.0.1", debug=True, inbrowser=True, share=share + ) except Exception: - print("Error launching GUI using 127.0.0.1.\nThis may be caused by global mode of proxy software.") + print( + "Error launching GUI using 127.0.0.1.\nThis may be caused by global mode of proxy software." 
+ ) demo.launch(debug=True, inbrowser=True, share=True) + # For auto-reloading while developing if __name__ == "__main__": setup_gui() diff --git a/pdf2zh/high_level.py b/pdf2zh/high_level.py index eeb980e0..940d5df9 100644 --- a/pdf2zh/high_level.py +++ b/pdf2zh/high_level.py @@ -46,7 +46,7 @@ def extract_text_to_fp( vchar: str = "", thread: int = 0, doc_en: Document = None, - model = None, + model=None, lang_in: str = "", lang_out: str = "", service: str = "", @@ -91,7 +91,7 @@ def extract_text_to_fp( rsrcmgr = PDFResourceManager(caching=not disable_caching) device: Optional[PDFDevice] = None - layout={} + layout = {} if output_type != "text" and outfp == sys.stdout: outfp = sys.stdout.buffer @@ -151,50 +151,68 @@ def extract_text_to_fp( raise PDFValueError(msg) assert device is not None - obj_patch={} + obj_patch = {} interpreter = PDFPageInterpreter(rsrcmgr, device, obj_patch) if pages: - total_pages=len(pages) + total_pages = len(pages) else: - total_pages=page_count - with tqdm.tqdm(PDFPage.get_pages( - inf, - pages, - maxpages=maxpages, - password=password, - caching=not disable_caching, - ), total=total_pages, position=0) as progress: + total_pages = page_count + with tqdm.tqdm( + PDFPage.get_pages( + inf, + pages, + maxpages=maxpages, + password=password, + caching=not disable_caching, + ), + total=total_pages, + position=0, + ) as progress: for page in progress: if callback: callback(progress) pix = doc_en[page.pageno].get_pixmap() - image = np.fromstring(pix.samples, np.uint8).reshape(pix.height, pix.width, 3)[:, :, ::-1] - page_layout=model.predict( + image = np.fromstring(pix.samples, np.uint8).reshape( + pix.height, pix.width, 3 + )[:, :, ::-1] + page_layout = model.predict( image, - imgsz=int(pix.height/32)*32, - device="cuda:0" if torch.cuda.is_available() else "cpu", # Auto-select GPU if available + imgsz=int(pix.height / 32) * 32, + device=( + "cuda:0" if torch.cuda.is_available() else "cpu" + ), # Auto-select GPU if available )[0] # kdtree 是不可能 
kdtree 的,不如直接渲染成图片,用空间换时间 - box=np.ones((pix.height, pix.width)) - h,w=box.shape - vcls=['abandon','figure','table','isolate_formula','formula_caption'] - for i,d in enumerate(page_layout.boxes): + box = np.ones((pix.height, pix.width)) + h, w = box.shape + vcls = ["abandon", "figure", "table", "isolate_formula", "formula_caption"] + for i, d in enumerate(page_layout.boxes): if not page_layout.names[int(d.cls)] in vcls: - x0,y0,x1,y1=d.xyxy.squeeze() - x0,y0,x1,y1=np.clip(int(x0-1),0,w-1),np.clip(int(h-y1-1),0,h-1),np.clip(int(x1+1),0,w-1),np.clip(int(h-y0+1),0,h-1) - box[y0:y1,x0:x1]=i+2 - for i,d in enumerate(page_layout.boxes): + x0, y0, x1, y1 = d.xyxy.squeeze() + x0, y0, x1, y1 = ( + np.clip(int(x0 - 1), 0, w - 1), + np.clip(int(h - y1 - 1), 0, h - 1), + np.clip(int(x1 + 1), 0, w - 1), + np.clip(int(h - y0 + 1), 0, h - 1), + ) + box[y0:y1, x0:x1] = i + 2 + for i, d in enumerate(page_layout.boxes): if page_layout.names[int(d.cls)] in vcls: - x0,y0,x1,y1=d.xyxy.squeeze() - x0,y0,x1,y1=np.clip(int(x0-1),0,w-1),np.clip(int(h-y1-1),0,h-1),np.clip(int(x1+1),0,w-1),np.clip(int(h-y0+1),0,h-1) - box[y0:y1,x0:x1]=0 - layout[page.pageno]=box + x0, y0, x1, y1 = d.xyxy.squeeze() + x0, y0, x1, y1 = ( + np.clip(int(x0 - 1), 0, w - 1), + np.clip(int(h - y1 - 1), 0, h - 1), + np.clip(int(x1 + 1), 0, w - 1), + np.clip(int(h - y0 + 1), 0, h - 1), + ) + box[y0:y1, x0:x1] = 0 + layout[page.pageno] = box # print(page.number,page_layout) page.rotate = (page.rotate + rotation) % 360 # 新建一个 xref 存放新指令流 - page.page_xref = doc_en.get_new_xref() # hack + page.page_xref = doc_en.get_new_xref() # hack doc_en.update_object(page.page_xref, "<<>>") - doc_en.update_stream(page.page_xref,b'') + doc_en.update_stream(page.page_xref, b"") doc_en[page.pageno].set_contents(page.page_xref) interpreter.process_page(page) diff --git a/pdf2zh/layout.py b/pdf2zh/layout.py index 6327ba6c..09208562 100644 --- a/pdf2zh/layout.py +++ b/pdf2zh/layout.py @@ -368,7 +368,7 @@ def __init__( LTText.__init__(self) 
self._text = text self.matrix = matrix - self.font=font + self.font = font self.fontname = font.fontname self.ncs = ncs self.graphicstate = graphicstate @@ -387,7 +387,7 @@ def __init__( bbox_upper_right = (-vx + fontsize, vy + rise) else: # horizontal - descent = 0 # descent = font.get_descent() * fontsize + descent = 0 # descent = font.get_descent() * fontsize bbox_lower_left = (0, descent + rise) bbox_upper_right = (self.adv, descent + rise + fontsize) (a, b, c, d, e, f) = self.matrix @@ -405,7 +405,14 @@ def __init__( self.size = self.height def __repr__(self) -> str: - return f"<{self.__class__.__name__} {bbox2str(self.bbox)} matrix={matrix2str(self.matrix)} font={self.fontname!r} adv={self.adv} text={self.get_text()!r}>" + return "<{} {} matrix={} font={} adv={} text={}>".format( + self.__class__.__name__, + bbox2str(self.bbox), + matrix2str(self.matrix), + repr(self.fontname), + self.adv, + repr(self.get_text()), + ) def get_text(self) -> str: return self._text diff --git a/pdf2zh/pdf2zh.py b/pdf2zh/pdf2zh.py index 88a71f0c..7153232c 100644 --- a/pdf2zh/pdf2zh.py +++ b/pdf2zh/pdf2zh.py @@ -119,7 +119,7 @@ def extract_text( doc_en.xref_set_key( xref, f"{label}Font/{font}", f"{font_id[font]} 0 R" ) - except: + except Exception: pass doc_en.save(Path(output) / f"{filename}-en.pdf") @@ -277,6 +277,7 @@ def main(args: Optional[List[str]] = None) -> int: return -1 if parsed_args.interactive: from pdf2zh.gui import setup_gui + setup_gui(parsed_args.share) return 0 diff --git a/pdf2zh/pdfdocument.py b/pdf2zh/pdfdocument.py index ac0c3272..535459eb 100644 --- a/pdf2zh/pdfdocument.py +++ b/pdf2zh/pdfdocument.py @@ -706,7 +706,7 @@ def __init__( try: # print('FIND XREF') pos = self.find_xref(parser) - self.pos=pos + self.pos = pos self.read_xref_from(parser, pos, self.xrefs) except PDFNoValidXRef: if fallback: diff --git a/pdf2zh/pdffont.py b/pdf2zh/pdffont.py index 32ccc9ed..5591e1e9 100644 --- a/pdf2zh/pdffont.py +++ b/pdf2zh/pdffont.py @@ -140,7 +140,7 @@ def 
get_encoding(self) -> Dict[int, str]: break try: self._cid2unicode[cid] = name2unicode(cast(str, name)) - except KeyError as e: + except KeyError: # log.debug(str(e)) pass return self._cid2unicode diff --git a/pdf2zh/pdfinterp.py b/pdf2zh/pdfinterp.py index 12306a1a..b9d23382 100644 --- a/pdf2zh/pdfinterp.py +++ b/pdf2zh/pdfinterp.py @@ -368,7 +368,9 @@ class PDFPageInterpreter: Reference: PDF Reference, Appendix A, Operator Summary """ - def __init__(self, rsrcmgr: PDFResourceManager, device: PDFDevice, obj_patch) -> None: + def __init__( + self, rsrcmgr: PDFResourceManager, device: PDFDevice, obj_patch + ) -> None: self.rsrcmgr = rsrcmgr self.device = device self.obj_patch = obj_patch @@ -407,7 +409,7 @@ def get_colorspace(spec: object) -> Optional[PDFColorSpace]: objid = spec.objid spec = dict_value(spec) self.fontmap[fontid] = self.rsrcmgr.get_font(objid, spec) - self.fontid[self.fontmap[fontid]]=fontid + self.fontid[self.fontmap[fontid]] = fontid elif k == "ColorSpace": for csid, spec in dict_value(v).items(): colorspace = get_colorspace(resolve1(spec)) @@ -570,16 +572,25 @@ def do_re(self, x: PDFStackT, y: PDFStackT, w: PDFStackT, h: PDFStackT) -> None: def do_S(self) -> None: """Stroke path""" + def is_black(color: Color) -> bool: if isinstance(color, Tuple): - return sum(color)==0 + return sum(color) == 0 else: - return color==0 - if len(self.curpath)==2 and self.curpath[0][0]=='m' and self.curpath[1][0]=='l' and apply_matrix_pt(self.ctm,self.curpath[0][-2:])[1]==apply_matrix_pt(self.ctm,self.curpath[1][-2:])[1] and is_black(self.graphicstate.scolor): # 独立直线,水平,黑色 + return color == 0 + + if ( + len(self.curpath) == 2 + and self.curpath[0][0] == "m" + and self.curpath[1][0] == "l" + and apply_matrix_pt(self.ctm, self.curpath[0][-2:])[1] + == apply_matrix_pt(self.ctm, self.curpath[1][-2:])[1] + and is_black(self.graphicstate.scolor) + ): # 独立直线,水平,黑色 # 
print(apply_matrix_pt(self.ctm,self.curpath[0][-2:]),apply_matrix_pt(self.ctm,self.curpath[1][-2:]),self.graphicstate.scolor) self.device.paint_path(self.graphicstate, True, False, False, self.curpath) self.curpath = [] - return 'n' + return "n" else: self.curpath = [] @@ -698,7 +709,7 @@ def do_SCN(self) -> None: if settings.STRICT: raise PDFInterpreterError("No colorspace specified!") n = 1 - args=self.pop(n) + args = self.pop(n) self.graphicstate.scolor = cast(Color, args) return args @@ -710,7 +721,7 @@ def do_scn(self) -> None: if settings.STRICT: raise PDFInterpreterError("No colorspace specified!") n = 1 - args=self.pop(n) + args = self.pop(n) self.graphicstate.ncolor = cast(Color, args) return args @@ -963,22 +974,24 @@ def do_Do(self, xobjid_arg: PDFStackT) -> None: else: resources = self.resources.copy() self.device.begin_figure(xobjid, bbox, matrix) - ctm=mult_matrix(matrix, self.ctm) - ops_base=interpreter.render_contents( + ctm = mult_matrix(matrix, self.ctm) + ops_base = interpreter.render_contents( resources, [xobj], ctm=ctm, ) - try: # 有的时候 form 字体加不上这里会烂掉 - self.device.fontid=interpreter.fontid - self.device.fontmap=interpreter.fontmap - ops_new=self.device.end_figure(xobjid) - ctm_inv=np.linalg.inv(np.array(ctm[:4]).reshape(2,2)) - pos_inv=-np.mat(ctm[4:])*ctm_inv - a,b,c,d=ctm_inv.reshape(4).tolist() - e,f=pos_inv.tolist()[0] - self.obj_patch[self.xobjmap[xobjid].objid]=f'q {ops_base}Q {a} {b} {c} {d} {e} {f} cm {ops_new}' - except: + try: # 有的时候 form 字体加不上这里会烂掉 + self.device.fontid = interpreter.fontid + self.device.fontmap = interpreter.fontmap + ops_new = self.device.end_figure(xobjid) + ctm_inv = np.linalg.inv(np.array(ctm[:4]).reshape(2, 2)) + pos_inv = -np.mat(ctm[4:]) * ctm_inv + a, b, c, d = ctm_inv.reshape(4).tolist() + e, f = pos_inv.tolist()[0] + self.obj_patch[self.xobjmap[xobjid].objid] = ( + f"q {ops_base}Q {a} {b} {c} {d} {e} {f} cm {ops_new}" + ) + except Exception: pass elif subtype is LITERAL_IMAGE and "Width" in xobj and 
"Height" in xobj: self.device.begin_figure(xobjid, (0, 0, 1, 1), MATRIX_IDENTITY) @@ -1002,14 +1015,16 @@ def process_page(self, page: PDFPage) -> None: else: ctm = (1, 0, 0, 1, -x0, -y0) self.device.begin_page(page, ctm) - ops_base=self.render_contents(page.resources, page.contents, ctm=ctm) - self.device.fontid=self.fontid - self.device.fontmap=self.fontmap - ops_new=self.device.end_page(page) + ops_base = self.render_contents(page.resources, page.contents, ctm=ctm) + self.device.fontid = self.fontid + self.device.fontmap = self.fontmap + ops_new = self.device.end_page(page) # 上面渲染的时候会根据 cropbox 减掉页面偏移得到真实坐标,这里输出的时候需要用 cm 把页面偏移加回来 - self.obj_patch[page.page_xref]=f'q {ops_base}Q 1 0 0 1 {x0} {y0} cm {ops_new}' # ops_base 里可能有图,需要让 ops_new 里的文字覆盖在上面,使用 q/Q 重置位置矩阵 + self.obj_patch[page.page_xref] = ( + f"q {ops_base}Q 1 0 0 1 {x0} {y0} cm {ops_new}" # ops_base 里可能有图,需要让 ops_new 里的文字覆盖在上面,使用 q/Q 重置位置矩阵 + ) for obj in page.contents: - self.obj_patch[obj.objid]='' + self.obj_patch[obj.objid] = "" def render_contents( self, @@ -1032,7 +1047,7 @@ def render_contents( return self.execute(list_value(streams)) def execute(self, streams: Sequence[object]) -> None: - ops='' + ops = "" try: parser = PDFContentParser(streams) except PSEOF: @@ -1057,17 +1072,38 @@ def execute(self, streams: Sequence[object]) -> None: # log.debug("exec: %s %r", name, args) if len(args) == nargs: func(*args) - if not (name[0]=='T' or name in ['"',"'",'EI','MP','DP','BMC','BDC']): # 过滤 T 系列文字指令,因为 EI 的参数是 obj 所以也需要过滤(只在少数文档中画横线时使用),过滤 marked 系列指令 - p=" ".join([f'{x:f}' if isinstance(x,float) else str(x).replace("'","") for x in args]) - ops+=f'{p} {name} ' + if not ( + name[0] == "T" + or name in ['"', "'", "EI", "MP", "DP", "BMC", "BDC"] + ): # 过滤 T 系列文字指令,因为 EI 的参数是 obj 所以也需要过滤(只在少数文档中画横线时使用),过滤 marked 系列指令 + p = " ".join( + [ + ( + f"{x:f}" + if isinstance(x, float) + else str(x).replace("'", "") + ) + for x in args + ] + ) + ops += f"{p} {name} " else: # log.debug("exec: %s", name) - 
targs=func() - if targs==None: - targs=[] - if not (name[0]=='T' or name in ['BI','ID','EMC']): - p=" ".join([f'{x:f}' if isinstance(x,float) else str(x).replace("'","") for x in targs]) - ops+=f'{p} {name} ' + targs = func() + if targs is None: + targs = [] + if not (name[0] == "T" or name in ["BI", "ID", "EMC"]): + p = " ".join( + [ + ( + f"{x:f}" + if isinstance(x, float) + else str(x).replace("'", "") + ) + for x in targs + ] + ) + ops += f"{p} {name} " elif settings.STRICT: error_msg = "Unknown operator: %r" % name raise PDFInterpreterError(error_msg) diff --git a/pdf2zh/pdfpage.py b/pdf2zh/pdfpage.py index 2864f077..e6ac705d 100644 --- a/pdf2zh/pdfpage.py +++ b/pdf2zh/pdfpage.py @@ -188,7 +188,7 @@ def get_pages( log.warning(warning_msg) # Process each page contained in the document. for pageno, page in enumerate(cls.create_pages(doc)): - page.pageno=pageno + page.pageno = pageno if pagenos and (pageno not in pagenos): continue yield page diff --git a/pdf2zh/psparser.py b/pdf2zh/psparser.py index 7472e540..1249153c 100644 --- a/pdf2zh/psparser.py +++ b/pdf2zh/psparser.py @@ -580,7 +580,7 @@ def nextobject(self) -> PSStackEntry[ExtraT]: :return: keywords, literals, strings, numbers, arrays and dictionaries. 
""" - end=None + end = None while not self.results: (pos, token) = self.nexttoken() if isinstance(token, (int, float, bool, str, bytes, PSLiteral)): @@ -632,8 +632,8 @@ def nextobject(self) -> PSStackEntry[ExtraT]: # token, # self.curstack, # ) - if token.name==b'endobj': - end=pos+7 + if token.name == b"endobj": + end = pos + 7 self.do_keyword(pos, token) else: log.error( @@ -653,4 +653,4 @@ def nextobject(self) -> PSStackEntry[ExtraT]: # log.debug("nextobject: %r", obj) # except Exception: # log.debug("nextobject: (unprintable object)") - return end,obj + return end, obj diff --git a/pdf2zh/translator.py b/pdf2zh/translator.py index 173bade0..58348608 100644 --- a/pdf2zh/translator.py +++ b/pdf2zh/translator.py @@ -19,11 +19,7 @@ def __init__(self, service, lang_out, lang_in, model): self.lang_in = lang_in self.model = model - def translate(self, text) -> str: - ... - - def __str__(self): - pass + def translate(self, text) -> str: ... # noqa: E704 def __str__(self): return f"{self.service} {self.lang_out} {self.lang_in}" @@ -37,7 +33,7 @@ def __init__(self, service, lang_out, lang_in, model): self.session = requests.Session() self.base_link = "http://translate.google.com/m" self.headers = { - "User-Agent": "Mozilla/4.0 (compatible;MSIE 6.0;Windows NT 5.1;SV1;.NET CLR 1.1.4322;.NET CLR 2.0.50727;.NET CLR 3.0.04506.30)" + "User-Agent": "Mozilla/4.0 (compatible;MSIE 6.0;Windows NT 5.1;SV1;.NET CLR 1.1.4322;.NET CLR 2.0.50727;.NET CLR 3.0.04506.30)" # noqa: E501 } def translate(self, text): @@ -51,7 +47,7 @@ def translate(self, text): r'(?s)class="(?:t0|result-container)">(.*?)<', response.text ) if response.status_code == 400: - result = 'IRREPARABLE TRANSLATION ERROR' + result = "IRREPARABLE TRANSLATION ERROR" elif len(re_result) == 0: raise ValueError("Empty translation result") else: @@ -80,7 +76,7 @@ def __init__(self, service, lang_out, lang_in, model): self.session = requests.Session() self.base_link = f"{server_url}/{auth_key}/translate" self.headers = { - 
"User-Agent": "Mozilla/4.0 (compatible;MSIE 6.0;Windows NT 5.1;SV1;.NET CLR 1.1.4322;.NET CLR 2.0.50727;.NET CLR 3.0.04506.30)" + "User-Agent": "Mozilla/4.0 (compatible;MSIE 6.0;Windows NT 5.1;SV1;.NET CLR 1.1.4322;.NET CLR 2.0.50727;.NET CLR 3.0.04506.30)" # noqa: E501 } def translate(self, text): @@ -115,27 +111,25 @@ def translate(self, text): class DeepLTranslator(BaseTranslator): def __init__(self, service, lang_out, lang_in, model): - lang_out='ZH' if lang_out=='auto' else lang_out - lang_in='EN' if lang_in=='auto' else lang_in + lang_out = "ZH" if lang_out == "auto" else lang_out + lang_in = "EN" if lang_in == "auto" else lang_in super().__init__(service, lang_out, lang_in, model) self.session = requests.Session() - auth_key = os.getenv('DEEPL_AUTH_KEY') - server_url = os.getenv('DEEPL_SERVER_URL') + auth_key = os.getenv("DEEPL_AUTH_KEY") + server_url = os.getenv("DEEPL_SERVER_URL") self.client = deepl.Translator(auth_key, server_url=server_url) def translate(self, text): response = self.client.translate_text( - text, - target_lang=self.lang_out, - source_lang=self.lang_in + text, target_lang=self.lang_out, source_lang=self.lang_in ) return response.text class OllamaTranslator(BaseTranslator): def __init__(self, service, lang_out, lang_in, model): - lang_out='zh-CN' if lang_out=='auto' else lang_out - lang_in='en' if lang_in=='auto' else lang_in + lang_out = "zh-CN" if lang_out == "auto" else lang_out + lang_in = "en" if lang_in == "auto" else lang_in super().__init__(service, lang_out, lang_in, model) self.options = {"temperature": 0} # 随机采样可能会打断公式标记 # OLLAMA_HOST @@ -152,16 +146,17 @@ def translate(self, text): }, { "role": "user", - "content": f"Translate the following markdown source text to {self.lang_out}. Keep the formula notation $v*$ unchanged. Output translation directly without any additional text.\nSource Text: {text}\nTranslated Text:", + "content": f"Translate the following markdown source text to {self.lang_out}. 
Keep the formula notation $v*$ unchanged. Output translation directly without any additional text.\nSource Text: {text}\nTranslated Text:", # noqa: E501 }, ], ) return response["message"]["content"].strip() + class OpenAITranslator(BaseTranslator): def __init__(self, service, lang_out, lang_in, model): - lang_out='zh-CN' if lang_out=='auto' else lang_out - lang_in='en' if lang_in=='auto' else lang_in + lang_out = "zh-CN" if lang_out == "auto" else lang_out + lang_in = "en" if lang_in == "auto" else lang_in super().__init__(service, lang_out, lang_in, model) self.options = {"temperature": 0} # 随机采样可能会打断公式标记 # OPENAI_BASE_URL @@ -179,7 +174,7 @@ def translate(self, text) -> str: }, { "role": "user", - "content": f"Translate the following markdown source text to {self.lang_out}. Keep the formula notation $v*$ unchanged. Output translation directly without any additional text.\nSource Text: {text}\nTranslated Text:", + "content": f"Translate the following markdown source text to {self.lang_out}. Keep the formula notation $v*$ unchanged. Output translation directly without any additional text.\nSource Text: {text}\nTranslated Text:", # noqa: E501 }, ], ) @@ -188,8 +183,8 @@ def translate(self, text) -> str: class AzureTranslator(BaseTranslator): def __init__(self, service, lang_out, lang_in, model): - lang_out='zh-Hans' if lang_out=='auto' else lang_out - lang_in='en' if lang_in=='auto' else lang_in + lang_out = "zh-Hans" if lang_out == "auto" else lang_out + lang_in = "en" if lang_in == "auto" else lang_in super().__init__(service, lang_out, lang_in, model) try: @@ -198,7 +193,9 @@ def __init__(self, service, lang_out, lang_in, model): region = os.environ["AZURE_REGION"] except KeyError as e: missing_var = e.args[0] - raise ValueError(f"The environment variable '{missing_var}' is required but not set.") from e + raise ValueError( + f"The environment variable '{missing_var}' is required but not set." 
+ ) from e credential = AzureKeyCredential(api_key) self.client = TextTranslationClient( diff --git a/pdf2zh/utils.py b/pdf2zh/utils.py index f76d78e3..25697fdf 100644 --- a/pdf2zh/utils.py +++ b/pdf2zh/utils.py @@ -284,9 +284,11 @@ def apply_matrix_norm(m: Matrix, v: Point) -> Point: (p, q) = v return a * p + c * q, b * p + d * q + def matrix_scale(m: Matrix) -> float: (a, b, c, d, e, f) = m - return (a**2+c**2)**0.5 + return (a**2 + c**2) ** 0.5 + # Utility functions diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 00000000..2b3e5804 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,46 @@ +[project] +name = "pdf2zh" +version = "1.7.9" +description = "Latex PDF Translator" +authors = [{ name = "Byaidu", email = "byaidux@gmail.com" }] +license = "AGPL-3.0" +readme = "README.md" +requires-python = ">=3.8,<3.13" +classifiers = [ + "Programming Language :: Python :: 3", + "Operating System :: OS Independent", +] +dependencies = [ + "charset-normalizer", + "cryptography", + "requests", + "pymupdf", + "tqdm", + "tenacity", + "doclayout-yolo", + "numpy", + "ollama", + "deepl<1.19.1", + "openai", + "azure-ai-translation-text<=1.0.1", + "gradio", + "huggingface_hub", + "torch", +] + +[project.optional-dependencies] +dev = [ + "black", + "flake8", + "pre-commit" +] + +[project.urls] +Homepage = "https://github.com/Byaidu/PDFMathTranslate" + +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project.scripts] +pdf2zh = "pdf2zh.pdf2zh:main" diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 0cc9bdcf..00000000 --- a/requirements.txt +++ /dev/null @@ -1,15 +0,0 @@ -charset-normalizer -cryptography -requests -pymupdf -tqdm -tenacity -doclayout-yolo -numpy -ollama -deepl<1.19.1 -openai -azure-ai-translation-text<=1.0.1 -gradio -huggingface_hub -torch \ No newline at end of file diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 00000000..053bd42e --- /dev/null +++ b/setup.cfg @@ -0,0 +1,4 @@ 
+[flake8] +max-line-length = 120 +ignore = E203,W503,E261 +exclude = .git,build,dist,docs \ No newline at end of file diff --git a/setup.py b/setup.py deleted file mode 100644 index 57225f72..00000000 --- a/setup.py +++ /dev/null @@ -1,31 +0,0 @@ -from pdf2zh import __version__, __author__ -from setuptools import setup - -with open("README.md", encoding='utf-8') as f: - readme = f.read() - -with open("requirements.txt", encoding='utf-8') as f: - requirements = f.readlines() - -setup( - name="pdf2zh", - long_description=readme, - long_description_content_type="text/markdown", - description="Latex PDF Translator", - license="AGPLv3", - version=__version__, - author=__author__, - author_email="byaidux@gmail.com", - url="https://github.com/Byaidu/PDFMathTranslate", - packages=["pdf2zh"], - install_requires=requirements, - classifiers=[ - "Programming Language :: Python :: 3", - "Operating System :: OS Independent", - ], - entry_points={ - 'console_scripts': [ - 'pdf2zh=pdf2zh.pdf2zh:main', - ] - }, -)