diff --git a/.github/workflows/python-build.yml b/.github/workflows/python-build.yml
new file mode 100644
index 00000000..11433830
--- /dev/null
+++ b/.github/workflows/python-build.yml
@@ -0,0 +1,31 @@
+name: Build Python Package
+
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/checkout@v4
+      - name: Set up Python
+        uses: actions/setup-python@v5  # v3 runs on the deprecated Node 16 runtime
+        with:
+          python-version: '3.x'  # quoted so it is always read as a string
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install build flake8 black
+
+      - name: Check code format  # fails the job on any formatting or lint error
+        run: |
+          black --check --diff --color pdf2zh/*.py
+          flake8
+
+      - name: Build package
+        run: python -m build
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
new file mode 100644
index 00000000..43d3f66d
--- /dev/null
+++ b/.pre-commit-config.yaml
@@ -0,0 +1,16 @@
+# See https://pre-commit.com for more information
+# See https://pre-commit.com/hooks.html for more hooks
+files: '^.*\.py$'
+repos:
+- repo: local
+  hooks:
+  - id: black
+    name: black
+    entry: black --check --diff --color
+    language: python
+    additional_dependencies: [black]  # local python hooks get an empty venv otherwise
+  - id: flake8
+    name: flake8
+    entry: flake8
+    language: python
+    additional_dependencies: [flake8]  # install the linter into the hook's venv
diff --git a/README.md b/README.md
index 05b11657..c1ce5e8f 100644
--- a/README.md
+++ b/README.md
@@ -37,13 +37,15 @@ Feel free to provide feedback in [GitHub Issues](https://github.com/Byaidu/PDFMa
Updates
+- [Nov. 23 2024] Firewall for preventing web bots *(by [@Byaidu](https://github.com/Byaidu))*
+- [Nov. 22 2024] GUI now supports Italian, and has been improved *(by [@Byaidu](https://github.com/Byaidu), [@reycn](https://github.com/reycn))*
+- [Nov. 22 2024] You can now share your deployed service to others *(by [@Zxis233](https://github.com/Zxis233))*
+- [Nov. 22 2024] Now supports Tencent Translation *(by [@hellofinch](https://github.com/hellofinch))*
- [Nov. 21 2024] GUI now supports downloading dual-document *(by [@reycn](https://github.com/reycn))*
- [Nov. 20 2024] GUI now supports specifying Ollama and OpenAI models *(by [@IuvenisSapiens](https://github.com/IuvenisSapiens), [@Byaidu](https://github.com/Byaidu))*
- [Nov. 20 2024] 🌟 [Demo](#demo) online! *(by [@reycn](https://github.com/reycn))*
- [Nov. 20 2024] Supports [Docker](#docker) *(by [@Byaidu](https://github.com/Byaidu))*
- [Nov. 20 2024] Supports [multiple-threads translation](#threads) *(by [@Byaidu](https://github.com/Byaidu))*
-- [Nov. 19 2024] Provides an [interactive graphical user interface](#gui) *(by [@reycn](https://github.com/reycn))*
-- [Nov. 18 2024] Supports [more services: DeepL, DeepLX, and Azure](#services) *(by [@reycn](https://github.com/reycn), [@Hanaasagi](https://github.com/Hanaasagi))*
Preview
diff --git a/README_zh-CN.md b/README_zh-CN.md
index a5c7b866..cde585a1 100644
--- a/README_zh-CN.md
+++ b/README_zh-CN.md
@@ -37,13 +37,16 @@
近期更新
+
+- [Nov. 23 2024] 防止网页爬虫的防火墙 *(by [@Byaidu](https://github.com/Byaidu))*
+- [Nov. 22 2024] 图形用户界面现已支持意大利语,并获得了一些更新 *(by [@Byaidu](https://github.com/Byaidu), [@reycn](https://github.com/reycn))*
+- [Nov. 22 2024] 现在你可以将自己部署的服务分享给朋友了 *(by [@Zxis233](https://github.com/Zxis233))*
+- [Nov. 22 2024] 现已支持腾讯翻译 *(by [@hellofinch](https://github.com/hellofinch))*
- [Nov. 21 2024] 图形用户界面现在支持下载双语文档 *(by [@reycn](https://github.com/reycn))*
- [Nov. 20 2024] 图形用户界面现在支持指定 Ollama 和 OpenAI 的模型 *(by [@IuvenisSapiens](https://github.com/IuvenisSapiens), [@Byaidu](https://github.com/Byaidu))*
- [Nov. 20 2024] 🌟 提供了 [在线演示](#demo)! *(by [@reycn](https://github.com/reycn))*
- [Nov. 20 2024] 支持 [容器化部署](#docker) *(by [@Byaidu](https://github.com/Byaidu))*
-- [Nov. 20 2024] 支持速度更快的 [多线程翻译](#threads) *(by [@Byaidu](https://github.com/Byaidu))*
-- [Nov. 19 2024] 提供了[图形用户界面](#gui) *(by [@reycn](https://github.com/reycn))*
-- [Nov. 18 2024] 支持更多翻译服务,包含 [DeepL, DeepLX, 和 Azure](#services) *(by [@reycn](https://github.com/reycn), [@Hanaasagi](https://github.com/Hanaasagi))*
+- [Nov. 20 2024] 支持速度更快的 [多线程翻译](#threads) *(by [@Byaidu](https://github.com/Byaidu))*
效果预览
diff --git a/pdf2zh/cache.py b/pdf2zh/cache.py
index 3e45b41c..275cd712 100644
--- a/pdf2zh/cache.py
+++ b/pdf2zh/cache.py
@@ -3,9 +3,10 @@
import time
import hashlib
import shutil
-cache_dir = os.path.join(tempfile.gettempdir(), 'cache')
+
+cache_dir = os.path.join(tempfile.gettempdir(), "cache")
os.makedirs(cache_dir, exist_ok=True)
-time_filename = 'update_time'
+time_filename = "update_time"
max_cache = 5
@@ -16,25 +17,30 @@ def deterministic_hash(obj):
def get_dirs():
- dirs = [os.path.join(cache_dir, dir) for dir in os.listdir(cache_dir) if os.path.isdir(os.path.join(cache_dir, dir))]
+ dirs = [
+ os.path.join(cache_dir, dir)
+ for dir in os.listdir(cache_dir)
+ if os.path.isdir(os.path.join(cache_dir, dir))
+ ]
return dirs
def get_time(dir):
try:
timefile = os.path.join(dir, time_filename)
- t = float(open(timefile, encoding='utf-8').read())
+ t = float(open(timefile, encoding="utf-8").read())
return t
except FileNotFoundError:
# handle the error as needed, for now we'll just return a default value
- return float('inf') # This ensures that this directory will be the first to be removed if required
-
+ return float(
+ "inf"
+ ) # This ensures that this directory will be the first to be removed if required
def write_time(dir):
timefile = os.path.join(dir, time_filename)
t = time.time()
- print(t, file=open(timefile, "w", encoding='utf-8'), end='')
+ print(t, file=open(timefile, "w", encoding="utf-8"), end="")
def argmin(iterable):
@@ -44,7 +50,9 @@ def argmin(iterable):
def remove_extra():
dirs = get_dirs()
for dir in dirs:
- if not os.path.isdir(dir): # This line might be redundant now, as get_dirs() ensures only directories are returned
+ if not os.path.isdir(
+ dir
+ ): # This line might be redundant now, as get_dirs() ensures only directories are returned
os.remove(dir)
try:
get_time(dir)
@@ -73,11 +81,11 @@ def create_cache(hash_key):
def load_paragraph(hash_key, hash_key_paragraph):
filename = os.path.join(cache_dir, hash_key, hash_key_paragraph)
if os.path.exists(filename):
- return open(filename, encoding='utf-8').read()
+ return open(filename, encoding="utf-8").read()
else:
return None
def write_paragraph(hash_key, hash_key_paragraph, paragraph):
filename = os.path.join(cache_dir, hash_key, hash_key_paragraph)
- print(paragraph, file=open(filename, "w", encoding='utf-8'), end='')
\ No newline at end of file
+ print(paragraph, file=open(filename, "w", encoding="utf-8"), end="")
diff --git a/pdf2zh/converter.py b/pdf2zh/converter.py
index 587159c3..70594ffc 100644
--- a/pdf2zh/converter.py
+++ b/pdf2zh/converter.py
@@ -1,3 +1,45 @@
+from pdf2zh.utils import (
+ AnyIO,
+ Matrix,
+ PathSegment,
+ Point,
+ Rect,
+ apply_matrix_pt,
+ bbox2str,
+ enc,
+ make_compat_str,
+ mult_matrix,
+ matrix_scale,
+)
+from pdf2zh.pdftypes import PDFStream
+from pdf2zh.pdfpage import PDFPage
+from pdf2zh.pdfinterp import PDFGraphicState, PDFResourceManager
+from pdf2zh.pdffont import PDFFont, PDFUnicodeNotDefined, PDFCIDFont
+from pdf2zh.pdfexceptions import PDFValueError
+from pdf2zh.pdfdevice import PDFTextDevice
+from pdf2zh.pdfcolor import PDFColorSpace
+from pdf2zh.layout import (
+ LAParams,
+ LTAnno,
+ LTChar,
+ LTComponent,
+ LTCurve,
+ LTFigure,
+ LTImage,
+ LTItem,
+ LTLayoutContainer,
+ LTLine,
+ LTPage,
+ LTRect,
+ LTText,
+ LTTextBox,
+ LTTextBoxVertical,
+ LTTextGroup,
+ LTTextLine,
+ TextGroupElement,
+)
+from pdf2zh.image import ImageWriter
+from pdf2zh import utils
import io
import logging
import re
@@ -28,55 +70,15 @@
OpenAITranslator,
AzureTranslator,
)
+
+
def remove_control_characters(s):
- return "".join(ch for ch in s if unicodedata.category(ch)[0]!="C")
+ return "".join(ch for ch in s if unicodedata.category(ch)[0] != "C")
-from pdf2zh import utils
-from pdf2zh.image import ImageWriter
-from pdf2zh.layout import (
- LAParams,
- LTAnno,
- LTChar,
- LTComponent,
- LTContainer,
- LTCurve,
- LTFigure,
- LTImage,
- LTItem,
- LTLayoutContainer,
- LTLine,
- LTPage,
- LTRect,
- LTText,
- LTTextBox,
- LTTextBoxVertical,
- LTTextGroup,
- LTTextLine,
- TextGroupElement,
-)
-from pdf2zh.pdfcolor import PDFColorSpace
-from pdf2zh.pdfdevice import PDFTextDevice
-from pdf2zh.pdfexceptions import PDFValueError
-from pdf2zh.pdffont import PDFFont, PDFUnicodeNotDefined, PDFCIDFont
-from pdf2zh.pdfinterp import PDFGraphicState, PDFResourceManager
-from pdf2zh.pdfpage import PDFPage
-from pdf2zh.pdftypes import PDFStream
-from pdf2zh.utils import (
- AnyIO,
- Matrix,
- PathSegment,
- Point,
- Rect,
- apply_matrix_pt,
- bbox2str,
- enc,
- make_compat_str,
- mult_matrix,
- matrix_scale,
-)
log = logging.getLogger(__name__)
+
class PDFLayoutAnalyzer(PDFTextDevice):
cur_item: LTLayoutContainer
ctm: Matrix
@@ -188,7 +190,7 @@ def paint_path(
# Note: 'ml', in conditional above, is a frequent anomaly
# that we want to support.
line = LTLine(
- gstate.linewidth*matrix_scale(self.ctm),
+ gstate.linewidth * matrix_scale(self.ctm),
pts[0],
pts[1],
stroke,
@@ -210,7 +212,7 @@ def paint_path(
) or (y0 == y1 and x1 == x2 and y2 == y3 and x3 == x0)
if is_closed_loop and has_square_coordinates:
rect = LTRect(
- gstate.linewidth*matrix_scale(self.ctm),
+ gstate.linewidth * matrix_scale(self.ctm),
(*pts[0], *pts[2]),
stroke,
fill,
@@ -223,7 +225,7 @@ def paint_path(
self.cur_item.add(rect)
else:
curve = LTCurve(
- gstate.linewidth*matrix_scale(self.ctm),
+ gstate.linewidth * matrix_scale(self.ctm),
pts,
stroke,
fill,
@@ -236,7 +238,7 @@ def paint_path(
self.cur_item.add(curve)
else:
curve = LTCurve(
- gstate.linewidth*matrix_scale(self.ctm),
+ gstate.linewidth * matrix_scale(self.ctm),
pts,
stroke,
fill,
@@ -279,7 +281,7 @@ def render_char(
graphicstate,
)
self.cur_item.add(item)
- item.cid=cid # hack
+ item.cid = cid # hack
return item.adv
def handle_undefined_char(self, font: PDFFont, cid: int) -> str:
@@ -355,7 +357,7 @@ def __init__(
vfont: str = None,
vchar: str = None,
thread: int = 0,
- layout = {},
+ layout={},
lang_in: str = "",
lang_out: str = "",
service: str = "",
@@ -367,7 +369,7 @@ def __init__(
self.vchar = vchar
self.thread = thread
self.layout = layout
- param=service.split(':',1)
+ param = service.split(":", 1)
if param[0] == "google":
self.translator: BaseTranslator = GoogleTranslator(
service, lang_out, lang_in, None
@@ -384,11 +386,11 @@ def __init__(
self.translator: BaseTranslator = OllamaTranslator(
service, lang_out, lang_in, param[1]
)
- elif param[0] == 'openai':
+ elif param[0] == "openai":
self.translator: BaseTranslator = OpenAITranslator(
service, lang_out, lang_in, param[1]
)
- elif param[0] == 'azure':
+ elif param[0] == "azure":
self.translator: BaseTranslator = AzureTranslator(
service, lang_out, lang_in, None
)
@@ -404,173 +406,255 @@ def write_text(self, text: str) -> None:
def receive_layout(self, ltpage: LTPage):
def render(item: LTItem) -> None:
- xt=None # 上一个字符
- sstk=[] # 段落文字栈
- vstk=[] # 公式符号组
- vlstk=[] # 公式线条组
- vfix=0 # 公式纵向偏移
- vbkt=0 # 段落公式括号计数
- pstk=[] # 段落属性栈
- lstk=[] # 全局线条栈
- var=[] # 公式符号组栈
- varl=[] # 公式线条组栈
- varf=[] # 公式纵向偏移栈
- vlen=[] # 公式宽度栈
- xt_cls=-1 # 上一个字符所属段落
- vmax=ltpage.width/4 # 行内公式最大宽度
- ops="" # 渲染结果
- def vflag(font,char): # 匹配公式(和角标)字体
- if re.match(r'\(cid:',char):
+ xt = None # 上一个字符
+ sstk = [] # 段落文字栈
+ vstk = [] # 公式符号组
+ vlstk = [] # 公式线条组
+ vfix = 0 # 公式纵向偏移
+ vbkt = 0 # 段落公式括号计数
+ pstk = [] # 段落属性栈
+ lstk = [] # 全局线条栈
+ var = [] # 公式符号组栈
+ varl = [] # 公式线条组栈
+ varf = [] # 公式纵向偏移栈
+ vlen = [] # 公式宽度栈
+ xt_cls = -1 # 上一个字符所属段落
+ vmax = ltpage.width / 4 # 行内公式最大宽度
+ ops = "" # 渲染结果
+
+ def vflag(font, char): # 匹配公式(和角标)字体
+ if re.match(r"\(cid:", char):
return True
if self.vfont:
- if re.match(self.vfont,font):
+ if re.match(self.vfont, font):
return True
else:
- if re.match(r'(CM[^R]|MS|XY|MT|BL|RM|EU|LA|RS|LINE|TeX-|rsfs|txsy|wasy|.*Mono|.*Code|.*Ital|.*Sym)',font):
+ if re.match(
+ r"(CM[^R]|MS|XY|MT|BL|RM|EU|LA|RS|LINE|TeX-|rsfs|txsy|wasy|.*Mono|.*Code|.*Ital|.*Sym)",
+ font,
+ ):
return True
if self.vchar:
- if re.match(self.vchar,char):
+ if re.match(self.vchar, char):
return True
else:
- if char and char!=' ' and (unicodedata.category(char[0]) in ['Lm','Mn','Sk','Sm','Zl','Zp','Zs'] or ord(char[0]) in range(0x370,0x400)): # 文字修饰符、数学符号、分隔符号、希腊字母
+ if (
+ char
+ and char != " "
+ and (
+ unicodedata.category(char[0])
+ in ["Lm", "Mn", "Sk", "Sm", "Zl", "Zp", "Zs"]
+ or ord(char[0]) in range(0x370, 0x400)
+ )
+ ): # 文字修饰符、数学符号、分隔符号、希腊字母
return True
return False
- ptr=0
- item=list(item)
- while ptrvmax and cls!=0): # 公式结束、段落边界、公式换行
- if vstk: # 公式出栈
- sstk[-1]+=f'$v{len(var)}$'
- if not cur_v and cls==xt_cls and child.x0>max([vch.x0 for vch in vstk]): # and child.y1>vstk[0].y0: # 段落内公式转文字,行内公式修正
- vfix=vstk[0].y0-child.y0
+ # ops+=f'ET [] 0 d 0 J 0.1 w {child.x0:f}
+ # {child.y0:f} {child.x1-child.x0:f} {child.y1-child.y0:f} re S Q BT '
+ if (
+ cls == 0
+ or (cls == xt_cls and child.size < pstk[-1][4] * 0.79)
+ or vflag(fontname, child.get_text())
+ or (child.matrix[0] == 0 and child.matrix[3] == 0)
+ ): # 有 0.76 的角标和 0.799 的大写,这里用 0.79 取中
+ cur_v = True
+ if not cur_v: # 判定括号组是否属于公式
+ if vstk and child.get_text() == "(":
+ cur_v = True
+ vbkt += 1
+ if vbkt and child.get_text() == ")":
+ cur_v = True
+ vbkt -= 1
+ if (
+ not cur_v
+ or cls != xt_cls
+ or (abs(child.x0 - xt.x0) > vmax and cls != 0)
+ ): # 公式结束、段落边界、公式换行
+ if vstk: # 公式出栈
+ sstk[-1] += f"$v{len(var)}$"
+ if (
+ not cur_v
+ and cls == xt_cls
+ and child.x0 > max([vch.x0 for vch in vstk])
+ ): # and child.y1>vstk[0].y0: # 段落内公式转文字,行内公式修正
+ vfix = vstk[0].y0 - child.y0
var.append(vstk)
varl.append(vlstk)
varf.append(vfix)
- vstk=[]
- vlstk=[]
- vfix=0
- if not vstk: # 非公式或是公式开头
- if cls==xt_cls: # 同一段落
- if child.x0 > xt.x1 + 1: # 行内空格
- sstk[-1]+=' '
- elif child.x1 < xt.x0: # 换行空格
- sstk[-1]+=' '
- pstk[-1][6]=True # 标记原文段落存在换行
+ vstk = []
+ vlstk = []
+ vfix = 0
+ if not vstk: # 非公式或是公式开头
+ if cls == xt_cls: # 同一段落
+ if child.x0 > xt.x1 + 1: # 行内空格
+ sstk[-1] += " "
+ elif child.x1 < xt.x0: # 换行空格
+ sstk[-1] += " "
+ pstk[-1][6] = True # 标记原文段落存在换行
else:
sstk.append("")
- pstk.append([child.y0,child.x0,child.x0,child.x0,child.size,child.font,False])
- if not cur_v: # 文字入栈
- if child.size>pstk[-1][4]/0.79 or vflag(pstk[-1][5].fontname.split('+')[-1],'') or re.match(r'(.*Medi|.*Bold)',pstk[-1][5].fontname.split('+')[-1],re.IGNORECASE): # 小字体、公式或粗体开头,后续接文字,需要校正字体
- pstk[-1][0]-=child.size-pstk[-1][4]
- pstk[-1][4]=child.size
- pstk[-1][5]=child.font
- sstk[-1]+=child.get_text()
- else: # 公式入栈
- if not vstk and cls==xt_cls and child.x0>xt.x0: # and child.y1>xt.y0: # 段落内文字转公式,行内公式修正
- vfix=child.y0-xt.y0
+ pstk.append(
+ [
+ child.y0,
+ child.x0,
+ child.x0,
+ child.x0,
+ child.size,
+ child.font,
+ False,
+ ]
+ )
+ if not cur_v: # 文字入栈
+ if (
+ child.size > pstk[-1][4] / 0.79
+ or vflag(pstk[-1][5].fontname.split("+")[-1], "")
+ or re.match(
+ r"(.*Medi|.*Bold)",
+ pstk[-1][5].fontname.split("+")[-1],
+ re.IGNORECASE,
+ )
+ ): # 小字体、公式或粗体开头,后续接文字,需要校正字体
+ pstk[-1][0] -= child.size - pstk[-1][4]
+ pstk[-1][4] = child.size
+ pstk[-1][5] = child.font
+ sstk[-1] += child.get_text()
+ else: # 公式入栈
+ if (
+ not vstk and cls == xt_cls and child.x0 > xt.x0
+ ): # and child.y1>xt.y0: # 段落内文字转公式,行内公式修正
+ vfix = child.y0 - xt.y0
vstk.append(child)
# 更新段落边界,段落内换行之后可能是公式开头
- pstk[-1][2]=min(pstk[-1][2],child.x0)
- pstk[-1][3]=max(pstk[-1][3],child.x1)
- xt=child
- xt_cls=cls
- elif isinstance(child, LTFigure): # 图表
+ pstk[-1][2] = min(pstk[-1][2], child.x0)
+ pstk[-1][3] = max(pstk[-1][3], child.x1)
+ xt = child
+ xt_cls = cls
+ elif isinstance(child, LTFigure): # 图表
pass
- elif isinstance(child, LTLine): # 线条
- layout=self.layout[ltpage.pageid]
- h,w=layout.shape # ltpage.height 可能是 fig 里面的高度,这里统一用 layout.shape
- cx,cy=np.clip(int(child.x0),0,w-1),np.clip(int(child.y0),0,h-1)
- cls=layout[cy,cx]
- if vstk and cls==xt_cls: # 公式线条
+ elif isinstance(child, LTLine): # 线条
+ layout = self.layout[ltpage.pageid]
+ h, w = (
+ layout.shape
+ ) # ltpage.height 可能是 fig 里面的高度,这里统一用 layout.shape
+ cx, cy = np.clip(int(child.x0), 0, w - 1), np.clip(
+ int(child.y0), 0, h - 1
+ )
+ cls = layout[cy, cx]
+ if vstk and cls == xt_cls: # 公式线条
vlstk.append(child)
- else: # 全局线条
+ else: # 全局线条
lstk.append(child)
else:
# print(child)
pass
- ptr+=1
+ ptr += 1
# 处理结尾
- if vstk: # 公式出栈
- sstk[-1]+=f'$v{len(var)}$'
+ if vstk: # 公式出栈
+ sstk[-1] += f"$v{len(var)}$"
var.append(vstk)
varl.append(vlstk)
varf.append(vfix)
- log.debug('\n==========[VSTACK]==========\n')
- for id,v in enumerate(var): # 计算公式宽度
- l=max([vch.x1 for vch in v])-v[0].x0
- log.debug(f'< {l:.1f} {v[0].x0:.1f} {v[0].y0:.1f} {v[0].cid} {v[0].fontname} {len(varl[id])} > $v{id}$ = {"".join([ch.get_text() for ch in v])}')
+ log.debug("\n==========[VSTACK]==========\n")
+ for id, v in enumerate(var): # 计算公式宽度
+ l = max([vch.x1 for vch in v]) - v[0].x0 # noqa: E741
+ log.debug(
+ f'< {l:.1f} {v[0].x0:.1f} {v[0].y0:.1f} {v[0].cid} {v[0].fontname} {len(varl[id])} > $v{id}$ = {"".join([ch.get_text() for ch in v])}' # noqa: E501
+ )
vlen.append(l)
- log.debug('\n==========[SSTACK]==========\n')
- hash_key=cache.deterministic_hash("PDFMathTranslate")
+ log.debug("\n==========[SSTACK]==========\n")
+ hash_key = cache.deterministic_hash("PDFMathTranslate")
cache.create_cache(hash_key)
+
@retry(wait=wait_fixed(1))
- def worker(s): # 多线程翻译
+ def worker(s): # 多线程翻译
try:
- hash_key_paragraph = cache.deterministic_hash((s,str(self.translator)))
- new = cache.load_paragraph(hash_key, hash_key_paragraph) # 查询缓存
+ hash_key_paragraph = cache.deterministic_hash(
+ (s, str(self.translator))
+ )
+ new = cache.load_paragraph(hash_key, hash_key_paragraph) # 查询缓存
if new is None:
- new=self.translator.translate(s)
- new=remove_control_characters(new)
+ new = self.translator.translate(s)
+ new = remove_control_characters(new)
cache.write_paragraph(hash_key, hash_key_paragraph, new)
return new
except BaseException as e:
if log.isEnabledFor(logging.DEBUG):
log.exception(e)
else:
- log.exception(e,exc_info=False)
+ log.exception(e, exc_info=False)
raise e
- with concurrent.futures.ThreadPoolExecutor(max_workers=self.thread) as executor:
+
+ with concurrent.futures.ThreadPoolExecutor(
+ max_workers=self.thread
+ ) as executor:
news = list(executor.map(worker, sstk))
- def raw_string(fcur,cstk): # 编码字符串
- if isinstance(self.fontmap[fcur],PDFCIDFont): # 判断编码长度
+
+ def raw_string(fcur, cstk): # 编码字符串
+ if isinstance(self.fontmap[fcur], PDFCIDFont): # 判断编码长度
return "".join(["%04x" % ord(c) for c in cstk])
else:
return "".join(["%02x" % ord(c) for c in cstk])
- _x,_y=0,0
- for id,new in enumerate(news): # 排版文字和公式
- tx=x=pstk[id][1];y=pstk[id][0];lt=pstk[id][2];rt=pstk[id][3];ptr=0;size=pstk[id][4];font=pstk[id][5];lb=pstk[id][6] # 段落属性
- cstk='' # 单行文字栈
- fcur=fcur_=None # 单行字体
- log.debug(f"< {y} {x} {lt} {rt} {size} {font.fontname} {lb} > {sstk[id]} | {new}")
+
+ _x, _y = 0, 0
+ for id, new in enumerate(news): # 排版文字和公式
+ tx = x = pstk[id][1]
+ y = pstk[id][0]
+ lt = pstk[id][2]
+ rt = pstk[id][3]
+ ptr = 0
+ size = pstk[id][4]
+ font = pstk[id][5]
+ lb = pstk[id][6] # 段落属性
+ cstk = "" # 单行文字栈
+ fcur = fcur_ = None # 单行字体
+ log.debug(
+ f"< {y} {x} {lt} {rt} {size} {font.fontname} {lb} > {sstk[id]} | {new}"
+ )
while True:
- if ptr==len(new): # 到达段落结尾
+ if ptr == len(new): # 到达段落结尾
if cstk:
- ops+=f'/{fcur} {size:f} Tf 1 0 0 1 {tx:f} {y:f} Tm [<{raw_string(fcur,cstk)}>] TJ '
+ ops += f"/{fcur} {size:f} Tf 1 0 0 1 {tx:f} {y:f} Tm [<{raw_string(fcur, cstk)}>] TJ "
break
- vy_regex=re.match(r'\$?\s*v([\d\s]+)\$',new[ptr:],re.IGNORECASE) # 匹配 $vn$ 公式标记,前面的 $ 有的时候会被丢掉
- mod=False # 当前公式是否为文字修饰符
- if vy_regex: # 加载公式
- ptr+=len(vy_regex.group(0))
+ vy_regex = re.match(
+ r"\$?\s*v([\d\s]+)\$", new[ptr:], re.IGNORECASE
+ ) # 匹配 $vn$ 公式标记,前面的 $ 有的时候会被丢掉
+ mod = False # 当前公式是否为文字修饰符
+ if vy_regex: # 加载公式
+ ptr += len(vy_regex.group(0))
try:
- vid=int(vy_regex.group(1).replace(' ',''))
- adv=vlen[vid]
- except:
- continue # 翻译器可能会自动补个越界的公式标记
- if len(var[vid])==1 and unicodedata.category(var[vid][0].get_text()[0]) in ['Lm','Mn','Sk']: # 文字修饰符
- mod=True
- else: # 加载文字
- ch=new[ptr]
+ vid = int(vy_regex.group(1).replace(" ", ""))
+ adv = vlen[vid]
+ except Exception:
+ continue # 翻译器可能会自动补个越界的公式标记
+ if len(var[vid]) == 1 and unicodedata.category(
+ var[vid][0].get_text()[0]
+ ) in [
+ "Lm",
+ "Mn",
+ "Sk",
+ ]: # 文字修饰符
+ mod = True
+ else: # 加载文字
+ ch = new[ptr]
# if font.char_width(ord(ch)):
- fcur_=None
+ fcur_ = None
# 原字体编码容易出问题,这里直接放弃掉
# try:
# if font.widths.get(ord(ch)) and font.to_unichr(ord(ch))==ch:
@@ -578,58 +662,84 @@ def raw_string(fcur,cstk): # 编码字符串
# except:
# pass
try:
- if fcur_==None and self.fontmap['tiro'].to_unichr(ord(ch))==ch:
- fcur_='tiro' # 默认英文字体
- except:
+ if (
+ fcur_ is None
+ and self.fontmap["tiro"].to_unichr(ord(ch)) == ch
+ ):
+ fcur_ = "tiro" # 默认英文字体
+ except Exception:
pass
- if fcur_==None:
- fcur_='china-ss' # 默认中文字体
+ if fcur_ is None:
+ fcur_ = "china-ss" # 默认中文字体
# print(self.fontid[font],fcur_,ch,font.char_width(ord(ch)))
- adv=self.fontmap[fcur_].char_width(ord(ch))*size
- ptr+=1
- if fcur_!=fcur or vy_regex or x+adv>rt+0.1*size: # 输出文字缓冲区:1.字体更新 2.插入公式 3.到达右边界(可能一整行都被符号化,这里需要考虑浮点误差)
+ adv = self.fontmap[fcur_].char_width(ord(ch)) * size
+ ptr += 1
+ if (
+ fcur_ != fcur or vy_regex or x + adv > rt + 0.1 * size
+ ): # 输出文字缓冲区:1.字体更新 2.插入公式 3.到达右边界(可能一整行都被符号化,这里需要考虑浮点误差)
if cstk:
- ops+=f'/{fcur} {size:f} Tf 1 0 0 1 {tx:f} {y:f} Tm [<{raw_string(fcur,cstk)}>] TJ '
- cstk=''
- if lb and x+adv>rt+0.1*size: # 到达右边界且原文段落存在换行
- x=lt
- lang_space={'zh-CN':1.4,'zh-TW':1.4,'ja':1.1,'ko':1.2,'en':1.2} # CJK
- y-=size*lang_space.get(self.translator.lang_out,1.1) # 小语种大多适配 1.1
- if vy_regex: # 插入公式
- fix=0
- if fcur!=None: # 段落内公式修正纵向偏移
- fix=varf[vid]
- for vch in var[vid]: # 排版公式字符
- vc=chr(vch.cid)
- ops+=f"/{self.fontid[vch.font]} {vch.size:f} Tf 1 0 0 1 {x+vch.x0-var[vid][0].x0:f} {fix+y+vch.y0-var[vid][0].y0:f} Tm [<{raw_string(self.fontid[vch.font],vc)}>] TJ "
+ ops += f"/{fcur} {size:f} Tf 1 0 0 1 {tx:f} {y:f} Tm [<{raw_string(fcur, cstk)}>] TJ "
+ cstk = ""
+ if lb and x + adv > rt + 0.1 * size: # 到达右边界且原文段落存在换行
+ x = lt
+ lang_space = {
+ "zh-CN": 1.4,
+ "zh-TW": 1.4,
+ "ja": 1.1,
+ "ko": 1.2,
+ "en": 1.2,
+ } # CJK
+ y -= size * lang_space.get(
+ self.translator.lang_out, 1.1
+ ) # 小语种大多适配 1.1
+ if vy_regex: # 插入公式
+ fix = 0
+ if fcur is not None: # 段落内公式修正纵向偏移
+ fix = varf[vid]
+ for vch in var[vid]: # 排版公式字符
+ vc = chr(vch.cid)
+ ops += f"/{self.fontid[vch.font]} {vch.size:f} Tf 1 0 0 1 {x + vch.x0 - var[vid][0].x0:f} {fix + y + vch.y0 - var[vid][0].y0:f} Tm [<{raw_string(self.fontid[vch.font], vc)}>] TJ " # noqa: E501
if log.isEnabledFor(logging.DEBUG):
- lstk.append(LTLine(0.1,(_x,_y),(x+vch.x0-var[vid][0].x0,fix+y+vch.y0-var[vid][0].y0)))
- _x,_y=x+vch.x0-var[vid][0].x0,fix+y+vch.y0-var[vid][0].y0
- for l in varl[vid]: # 排版公式线条
- if l.linewidth<5: # hack
- ops+=f"ET q 1 0 0 1 {l.pts[0][0]+x-var[vid][0].x0:f} {l.pts[0][1]+fix+y-var[vid][0].y0:f} cm [] 0 d 0 J {l.linewidth:f} w 0 0 m {l.pts[1][0]-l.pts[0][0]:f} {l.pts[1][1]-l.pts[0][1]:f} l S Q BT "
- else: # 插入文字缓冲区
- if not cstk: # 单行开头
- tx=x
- if x==lt and ch==' ': # 消除段落换行空格
- adv=0
+ lstk.append(
+ LTLine(
+ 0.1,
+ (_x, _y),
+ (
+ x + vch.x0 - var[vid][0].x0,
+ fix + y + vch.y0 - var[vid][0].y0,
+ ),
+ )
+ )
+ _x, _y = (
+ x + vch.x0 - var[vid][0].x0,
+ fix + y + vch.y0 - var[vid][0].y0,
+ )
+ for l in varl[vid]: # 排版公式线条 # noqa: E741
+ if l.linewidth < 5: # hack
+ ops += f"ET q 1 0 0 1 {l.pts[0][0] + x - var[vid][0].x0:f} {l.pts[0][1] + fix + y - var[vid][0].y0:f} cm [] 0 d 0 J {l.linewidth:f} w 0 0 m {l.pts[1][0] - l.pts[0][0]:f} {l.pts[1][1] - l.pts[0][1]:f} l S Q BT " # noqa: E501
+ else: # 插入文字缓冲区
+ if not cstk: # 单行开头
+ tx = x
+ if x == lt and ch == " ": # 消除段落换行空格
+ adv = 0
else:
- cstk+=ch
+ cstk += ch
else:
- cstk+=ch
- if mod: # 文字修饰符
- adv=0
- fcur=fcur_
- x+=adv
+ cstk += ch
+ if mod: # 文字修饰符
+ adv = 0
+ fcur = fcur_
+ x += adv
if log.isEnabledFor(logging.DEBUG):
- lstk.append(LTLine(0.1,(_x,_y),(x,y)))
- _x,_y=x,y
- for l in lstk: # 排版全局线条
- if l.linewidth<5: # hack
- ops+=f"ET q 1 0 0 1 {l.pts[0][0]:f} {l.pts[0][1]:f} cm [] 0 d 0 J {l.linewidth:f} w 0 0 m {l.pts[1][0]-l.pts[0][0]:f} {l.pts[1][1]-l.pts[0][1]:f} l S Q BT "
- ops=f'BT {ops}ET '
+ lstk.append(LTLine(0.1, (_x, _y), (x, y)))
+ _x, _y = x, y
+ for l in lstk: # 排版全局线条 # noqa: E741
+ if l.linewidth < 5: # hack
+ ops += f"ET q 1 0 0 1 {l.pts[0][0]:f} {l.pts[0][1]:f} cm [] 0 d 0 J {l.linewidth:f} w 0 0 m {l.pts[1][0] - l.pts[0][0]:f} {l.pts[1][1] - l.pts[0][1]:f} l S Q BT " # noqa: E501
+ ops = f"BT {ops}ET "
return ops
- ops=render(ltpage)
+
+ ops = render(ltpage)
return ops
# Some dummy functions to save memory/CPU when all that is wanted
diff --git a/pdf2zh/encodingdb.py b/pdf2zh/encodingdb.py
index 877ea892..ee6a1061 100644
--- a/pdf2zh/encodingdb.py
+++ b/pdf2zh/encodingdb.py
@@ -120,7 +120,7 @@ def get_encoding(
elif isinstance(x, PSLiteral):
try:
cid2unicode[cid] = name2unicode(cast(str, x.name))
- except (KeyError, ValueError) as e:
+ except (KeyError, ValueError):
# log.debug(str(e))
pass
cid += 1
diff --git a/pdf2zh/fontmetrics.py b/pdf2zh/fontmetrics.py
index b6780b96..c95c1c1d 100644
--- a/pdf2zh/fontmetrics.py
+++ b/pdf2zh/fontmetrics.py
@@ -9,7 +9,7 @@
"""
-### BEGIN Verbatim copy of the license part
+# BEGIN Verbatim copy of the license part
#
# Adobe Core 35 AFM Files with 314 Glyph Entries - ReadMe
@@ -24,7 +24,7 @@
# obligation to support the use of the AFM files.
#
-### END Verbatim copy of the license part
+# END Verbatim copy of the license part
# flake8: noqa
from typing import Dict
diff --git a/pdf2zh/gui.py b/pdf2zh/gui.py
index b35117db..8b5b4a17 100644
--- a/pdf2zh/gui.py
+++ b/pdf2zh/gui.py
@@ -33,32 +33,32 @@
page_map = {
"All": None,
"First": [0],
- "First 5 pages": list(range(0,5)),
+ "First 5 pages": list(range(0, 5)),
}
-flag_demo=False
-if os.environ.get('PDF2ZH_DEMO'):
- flag_demo=True
+flag_demo = False
+if os.environ.get("PDF2ZH_DEMO"):
+ flag_demo = True
service_map = {
"Google": "google",
}
page_map = {
"First": [0],
- "First 20 pages": list(range(0,20)),
+ "First 20 pages": list(range(0, 20)),
}
- client_key=os.environ.get('PDF2ZH_CLIENT_KEY')
- server_key=os.environ.get('PDF2ZH_SERVER_KEY')
+ client_key = os.environ.get("PDF2ZH_CLIENT_KEY")
+ server_key = os.environ.get("PDF2ZH_SERVER_KEY")
def verify_recaptcha(response):
recaptcha_url = "https://www.google.com/recaptcha/api/siteverify"
- print('reCAPTCHA',server_key,response)
+ print("reCAPTCHA", server_key, response)
data = {"secret": server_key, "response": response}
result = requests.post(recaptcha_url, data=data).json()
- print('reCAPTCHA',result.get("success"))
+ print("reCAPTCHA", result.get("success"))
return result.get("success")
@@ -87,14 +87,20 @@ def upload_file(file, service, progress=gr.Progress()):
def translate(
- file_path, service, model_id, lang, page_range, recaptcha_response, progress=gr.Progress()
+ file_path,
+ service,
+ model_id,
+ lang,
+ page_range,
+ recaptcha_response,
+ progress=gr.Progress(),
):
"""Translate PDF content using selected service."""
if not file_path:
- raise gr.Error('No input')
+ raise gr.Error("No input")
if flag_demo and not verify_recaptcha(recaptcha_response):
- raise gr.Error('reCAPTCHA fail')
+ raise gr.Error("reCAPTCHA fail")
progress(0, desc="Starting translation...")
@@ -113,30 +119,31 @@ def translate(
lang_to = "zh-CN" if lang_to == "zh" else lang_to
print(f"Files before translation: {os.listdir(output)}")
- def progress_bar(t:tqdm.tqdm):
- progress(t.n/t.total, desc="Translating...")
-
- param={
- 'files':[file_en],
- 'pages':selected_page,
- 'lang_in':'auto',
- 'lang_out':lang_to,
- 'service':f"{selected_service}:{model_id}",
- 'output':output,
- 'thread':4,
- 'callback':progress_bar,
- }
+
+ def progress_bar(t: tqdm.tqdm):
+ progress(t.n / t.total, desc="Translating...")
+
+ param = {
+ "files": [file_en],
+ "pages": selected_page,
+ "lang_in": "auto",
+ "lang_out": lang_to,
+ "service": f"{selected_service}:{model_id}",
+ "output": output,
+ "thread": 4,
+ "callback": progress_bar,
+ }
print(param)
extract_text(**param)
print(f"Files after translation: {os.listdir(output)}")
if not file_zh.exists() or not file_dual.exists():
- raise gr.Error('No output')
+ raise gr.Error("No output")
try:
translated_preview = pdf_preview(str(file_zh))
- except Exception as e:
- raise gr.Error('No preview')
+ except Exception:
+ raise gr.Error("No preview")
progress(1.0, desc="Translation complete!")
@@ -175,7 +182,7 @@ def progress_bar(t:tqdm.tqdm):
footer {visibility: hidden}
.env-warning {color: #dd5500 !important;}
.env-success {color: #559900 !important;}
-
+
@keyframes pulse-background {
0% { background-color: #FFFFFF; }
25% { background-color: #FFFFFF; }
@@ -183,7 +190,7 @@ def progress_bar(t:tqdm.tqdm):
75% { background-color: #FFFFFF; }
100% { background-color: #FFFFFF; }
}
-
+
/* Add dashed border to input-file class */
.input-file {
border: 1.2px dashed #165DFF !important;
@@ -232,7 +239,9 @@ def progress_bar(t:tqdm.tqdm):
''' if flag_demo else ""
) as demo:
- gr.Markdown("# [PDFMathTranslate @ Github](https://github.com/Byaidu/PDFMathTranslate)")
+ gr.Markdown(
+ "# [PDFMathTranslate @ Github](https://github.com/Byaidu/PDFMathTranslate)"
+ )
with gr.Row():
with gr.Column(scale=1):
@@ -265,14 +274,15 @@ def progress_bar(t:tqdm.tqdm):
)
model_id = gr.Textbox(
label="Model ID",
- info="Please enter the identifier of the model you wish to use (e.g., gemma2). This identifier will be used to specify the particular model for translation.",
+ info="Please enter the identifier of the model you wish to use (e.g., gemma2). "
+ "This identifier will be used to specify the particular model for translation.",
# value="gemma2",
visible=False, # hide by default
)
envs_status = "- Properly configured.
"
def details_wrapper(text_markdown):
- text = f"""
+ text = f"""
Technical details
{text_markdown}
@@ -287,7 +297,11 @@ def env_var_checker(env_var_name: str) -> str:
not os.environ.get(env_var_name)
or os.environ.get(env_var_name) == ""
):
- envs_status = f"- Warning: environmental not found or error ({env_var_name}).
- Please make sure that the environment variables are properly configured (guide).
"
+ envs_status = (
+ f"- Warning: environmental not found or error ({env_var_name})."
+ + "
- Please make sure that the environment variables are properly configured "
+ + "(guide).
"
+ )
else:
value = str(os.environ.get(env_var_name))
envs_status = (
@@ -327,7 +341,11 @@ def on_select_service(value, evt: gr.EventData):
) # show model id when service is selected
envs_status = env_var_checker("OLLAMA_HOST")
else:
- envs_status = "- Warning: model not in the list.
- Please report via (guide).
"
+ envs_status = (
+ "- Warning: model not in the list."
+ "
- Please report via "
+ "(guide).
"
+ )
return envs_status, model_visibility
output_title = gr.Markdown("## Translated", visible=False)
@@ -381,20 +399,28 @@ def on_select_service(value, evt: gr.EventData):
def setup_gui(share=False):
- import doclayout_yolo # cache
+ import doclayout_yolo # cache # noqa: F401
+
if flag_demo:
- demo.launch(server_name="0.0.0.0", max_file_size='5mb', inbrowser=True)
+ demo.launch(server_name="0.0.0.0", max_file_size="5mb", inbrowser=True)
else:
try:
demo.launch(server_name="0.0.0.0", debug=True, inbrowser=True, share=share)
except Exception:
- print("Error launching GUI using 0.0.0.0.\nThis may be caused by global mode of proxy software.")
+ print(
+ "Error launching GUI using 0.0.0.0.\nThis may be caused by global mode of proxy software."
+ )
try:
- demo.launch(server_name="127.0.0.1", debug=True, inbrowser=True, share=share)
+ demo.launch(
+ server_name="127.0.0.1", debug=True, inbrowser=True, share=share
+ )
except Exception:
- print("Error launching GUI using 127.0.0.1.\nThis may be caused by global mode of proxy software.")
+ print(
+ "Error launching GUI using 127.0.0.1.\nThis may be caused by global mode of proxy software."
+ )
demo.launch(debug=True, inbrowser=True, share=True)
+
# For auto-reloading while developing
if __name__ == "__main__":
setup_gui()
diff --git a/pdf2zh/high_level.py b/pdf2zh/high_level.py
index eeb980e0..940d5df9 100644
--- a/pdf2zh/high_level.py
+++ b/pdf2zh/high_level.py
@@ -46,7 +46,7 @@ def extract_text_to_fp(
vchar: str = "",
thread: int = 0,
doc_en: Document = None,
- model = None,
+ model=None,
lang_in: str = "",
lang_out: str = "",
service: str = "",
@@ -91,7 +91,7 @@ def extract_text_to_fp(
rsrcmgr = PDFResourceManager(caching=not disable_caching)
device: Optional[PDFDevice] = None
- layout={}
+ layout = {}
if output_type != "text" and outfp == sys.stdout:
outfp = sys.stdout.buffer
@@ -151,50 +151,68 @@ def extract_text_to_fp(
raise PDFValueError(msg)
assert device is not None
- obj_patch={}
+ obj_patch = {}
interpreter = PDFPageInterpreter(rsrcmgr, device, obj_patch)
if pages:
- total_pages=len(pages)
+ total_pages = len(pages)
else:
- total_pages=page_count
- with tqdm.tqdm(PDFPage.get_pages(
- inf,
- pages,
- maxpages=maxpages,
- password=password,
- caching=not disable_caching,
- ), total=total_pages, position=0) as progress:
+ total_pages = page_count
+ with tqdm.tqdm(
+ PDFPage.get_pages(
+ inf,
+ pages,
+ maxpages=maxpages,
+ password=password,
+ caching=not disable_caching,
+ ),
+ total=total_pages,
+ position=0,
+ ) as progress:
for page in progress:
if callback:
callback(progress)
pix = doc_en[page.pageno].get_pixmap()
- image = np.fromstring(pix.samples, np.uint8).reshape(pix.height, pix.width, 3)[:, :, ::-1]
- page_layout=model.predict(
+ image = np.fromstring(pix.samples, np.uint8).reshape(
+ pix.height, pix.width, 3
+ )[:, :, ::-1]
+ page_layout = model.predict(
image,
- imgsz=int(pix.height/32)*32,
- device="cuda:0" if torch.cuda.is_available() else "cpu", # Auto-select GPU if available
+ imgsz=int(pix.height / 32) * 32,
+ device=(
+ "cuda:0" if torch.cuda.is_available() else "cpu"
+ ), # Auto-select GPU if available
)[0]
# kdtree 是不可能 kdtree 的,不如直接渲染成图片,用空间换时间
- box=np.ones((pix.height, pix.width))
- h,w=box.shape
- vcls=['abandon','figure','table','isolate_formula','formula_caption']
- for i,d in enumerate(page_layout.boxes):
+ box = np.ones((pix.height, pix.width))
+ h, w = box.shape
+ vcls = ["abandon", "figure", "table", "isolate_formula", "formula_caption"]
+ for i, d in enumerate(page_layout.boxes):
if not page_layout.names[int(d.cls)] in vcls:
- x0,y0,x1,y1=d.xyxy.squeeze()
- x0,y0,x1,y1=np.clip(int(x0-1),0,w-1),np.clip(int(h-y1-1),0,h-1),np.clip(int(x1+1),0,w-1),np.clip(int(h-y0+1),0,h-1)
- box[y0:y1,x0:x1]=i+2
- for i,d in enumerate(page_layout.boxes):
+ x0, y0, x1, y1 = d.xyxy.squeeze()
+ x0, y0, x1, y1 = (
+ np.clip(int(x0 - 1), 0, w - 1),
+ np.clip(int(h - y1 - 1), 0, h - 1),
+ np.clip(int(x1 + 1), 0, w - 1),
+ np.clip(int(h - y0 + 1), 0, h - 1),
+ )
+ box[y0:y1, x0:x1] = i + 2
+ for i, d in enumerate(page_layout.boxes):
if page_layout.names[int(d.cls)] in vcls:
- x0,y0,x1,y1=d.xyxy.squeeze()
- x0,y0,x1,y1=np.clip(int(x0-1),0,w-1),np.clip(int(h-y1-1),0,h-1),np.clip(int(x1+1),0,w-1),np.clip(int(h-y0+1),0,h-1)
- box[y0:y1,x0:x1]=0
- layout[page.pageno]=box
+ x0, y0, x1, y1 = d.xyxy.squeeze()
+ x0, y0, x1, y1 = (
+ np.clip(int(x0 - 1), 0, w - 1),
+ np.clip(int(h - y1 - 1), 0, h - 1),
+ np.clip(int(x1 + 1), 0, w - 1),
+ np.clip(int(h - y0 + 1), 0, h - 1),
+ )
+ box[y0:y1, x0:x1] = 0
+ layout[page.pageno] = box
# print(page.number,page_layout)
page.rotate = (page.rotate + rotation) % 360
# 新建一个 xref 存放新指令流
- page.page_xref = doc_en.get_new_xref() # hack
+ page.page_xref = doc_en.get_new_xref() # hack
doc_en.update_object(page.page_xref, "<<>>")
- doc_en.update_stream(page.page_xref,b'')
+ doc_en.update_stream(page.page_xref, b"")
doc_en[page.pageno].set_contents(page.page_xref)
interpreter.process_page(page)
diff --git a/pdf2zh/layout.py b/pdf2zh/layout.py
index 6327ba6c..09208562 100644
--- a/pdf2zh/layout.py
+++ b/pdf2zh/layout.py
@@ -368,7 +368,7 @@ def __init__(
LTText.__init__(self)
self._text = text
self.matrix = matrix
- self.font=font
+ self.font = font
self.fontname = font.fontname
self.ncs = ncs
self.graphicstate = graphicstate
@@ -387,7 +387,7 @@ def __init__(
bbox_upper_right = (-vx + fontsize, vy + rise)
else:
# horizontal
- descent = 0 # descent = font.get_descent() * fontsize
+ descent = 0 # descent = font.get_descent() * fontsize
bbox_lower_left = (0, descent + rise)
bbox_upper_right = (self.adv, descent + rise + fontsize)
(a, b, c, d, e, f) = self.matrix
@@ -405,7 +405,14 @@ def __init__(
self.size = self.height
def __repr__(self) -> str:
- return f"<{self.__class__.__name__} {bbox2str(self.bbox)} matrix={matrix2str(self.matrix)} font={self.fontname!r} adv={self.adv} text={self.get_text()!r}>"
+ return "<{} {} matrix={} font={} adv={} text={}>".format(
+ self.__class__.__name__,
+ bbox2str(self.bbox),
+ matrix2str(self.matrix),
+ repr(self.fontname),
+ self.adv,
+ repr(self.get_text()),
+ )
def get_text(self) -> str:
return self._text
diff --git a/pdf2zh/pdf2zh.py b/pdf2zh/pdf2zh.py
index 88a71f0c..7153232c 100644
--- a/pdf2zh/pdf2zh.py
+++ b/pdf2zh/pdf2zh.py
@@ -119,7 +119,7 @@ def extract_text(
doc_en.xref_set_key(
xref, f"{label}Font/{font}", f"{font_id[font]} 0 R"
)
- except:
+ except Exception:
pass
doc_en.save(Path(output) / f"{filename}-en.pdf")
@@ -277,6 +277,7 @@ def main(args: Optional[List[str]] = None) -> int:
return -1
if parsed_args.interactive:
from pdf2zh.gui import setup_gui
+
setup_gui(parsed_args.share)
return 0
diff --git a/pdf2zh/pdfdocument.py b/pdf2zh/pdfdocument.py
index ac0c3272..535459eb 100644
--- a/pdf2zh/pdfdocument.py
+++ b/pdf2zh/pdfdocument.py
@@ -706,7 +706,7 @@ def __init__(
try:
# print('FIND XREF')
pos = self.find_xref(parser)
- self.pos=pos
+ self.pos = pos
self.read_xref_from(parser, pos, self.xrefs)
except PDFNoValidXRef:
if fallback:
diff --git a/pdf2zh/pdffont.py b/pdf2zh/pdffont.py
index 32ccc9ed..5591e1e9 100644
--- a/pdf2zh/pdffont.py
+++ b/pdf2zh/pdffont.py
@@ -140,7 +140,7 @@ def get_encoding(self) -> Dict[int, str]:
break
try:
self._cid2unicode[cid] = name2unicode(cast(str, name))
- except KeyError as e:
+ except KeyError:
# log.debug(str(e))
pass
return self._cid2unicode
diff --git a/pdf2zh/pdfinterp.py b/pdf2zh/pdfinterp.py
index 12306a1a..b9d23382 100644
--- a/pdf2zh/pdfinterp.py
+++ b/pdf2zh/pdfinterp.py
@@ -368,7 +368,9 @@ class PDFPageInterpreter:
Reference: PDF Reference, Appendix A, Operator Summary
"""
- def __init__(self, rsrcmgr: PDFResourceManager, device: PDFDevice, obj_patch) -> None:
+ def __init__(
+ self, rsrcmgr: PDFResourceManager, device: PDFDevice, obj_patch
+ ) -> None:
self.rsrcmgr = rsrcmgr
self.device = device
self.obj_patch = obj_patch
@@ -407,7 +409,7 @@ def get_colorspace(spec: object) -> Optional[PDFColorSpace]:
objid = spec.objid
spec = dict_value(spec)
self.fontmap[fontid] = self.rsrcmgr.get_font(objid, spec)
- self.fontid[self.fontmap[fontid]]=fontid
+ self.fontid[self.fontmap[fontid]] = fontid
elif k == "ColorSpace":
for csid, spec in dict_value(v).items():
colorspace = get_colorspace(resolve1(spec))
@@ -570,16 +572,25 @@ def do_re(self, x: PDFStackT, y: PDFStackT, w: PDFStackT, h: PDFStackT) -> None:
def do_S(self) -> None:
"""Stroke path"""
+
def is_black(color: Color) -> bool:
if isinstance(color, Tuple):
- return sum(color)==0
+ return sum(color) == 0
else:
- return color==0
- if len(self.curpath)==2 and self.curpath[0][0]=='m' and self.curpath[1][0]=='l' and apply_matrix_pt(self.ctm,self.curpath[0][-2:])[1]==apply_matrix_pt(self.ctm,self.curpath[1][-2:])[1] and is_black(self.graphicstate.scolor): # 独立直线,水平,黑色
+ return color == 0
+
+ if (
+ len(self.curpath) == 2
+ and self.curpath[0][0] == "m"
+ and self.curpath[1][0] == "l"
+ and apply_matrix_pt(self.ctm, self.curpath[0][-2:])[1]
+ == apply_matrix_pt(self.ctm, self.curpath[1][-2:])[1]
+ and is_black(self.graphicstate.scolor)
+ ): # 独立直线,水平,黑色
# print(apply_matrix_pt(self.ctm,self.curpath[0][-2:]),apply_matrix_pt(self.ctm,self.curpath[1][-2:]),self.graphicstate.scolor)
self.device.paint_path(self.graphicstate, True, False, False, self.curpath)
self.curpath = []
- return 'n'
+ return "n"
else:
self.curpath = []
@@ -698,7 +709,7 @@ def do_SCN(self) -> None:
if settings.STRICT:
raise PDFInterpreterError("No colorspace specified!")
n = 1
- args=self.pop(n)
+ args = self.pop(n)
self.graphicstate.scolor = cast(Color, args)
return args
@@ -710,7 +721,7 @@ def do_scn(self) -> None:
if settings.STRICT:
raise PDFInterpreterError("No colorspace specified!")
n = 1
- args=self.pop(n)
+ args = self.pop(n)
self.graphicstate.ncolor = cast(Color, args)
return args
@@ -963,22 +974,24 @@ def do_Do(self, xobjid_arg: PDFStackT) -> None:
else:
resources = self.resources.copy()
self.device.begin_figure(xobjid, bbox, matrix)
- ctm=mult_matrix(matrix, self.ctm)
- ops_base=interpreter.render_contents(
+ ctm = mult_matrix(matrix, self.ctm)
+ ops_base = interpreter.render_contents(
resources,
[xobj],
ctm=ctm,
)
- try: # 有的时候 form 字体加不上这里会烂掉
- self.device.fontid=interpreter.fontid
- self.device.fontmap=interpreter.fontmap
- ops_new=self.device.end_figure(xobjid)
- ctm_inv=np.linalg.inv(np.array(ctm[:4]).reshape(2,2))
- pos_inv=-np.mat(ctm[4:])*ctm_inv
- a,b,c,d=ctm_inv.reshape(4).tolist()
- e,f=pos_inv.tolist()[0]
- self.obj_patch[self.xobjmap[xobjid].objid]=f'q {ops_base}Q {a} {b} {c} {d} {e} {f} cm {ops_new}'
- except:
+ try: # 有的时候 form 字体加不上这里会烂掉
+ self.device.fontid = interpreter.fontid
+ self.device.fontmap = interpreter.fontmap
+ ops_new = self.device.end_figure(xobjid)
+ ctm_inv = np.linalg.inv(np.array(ctm[:4]).reshape(2, 2))
+ pos_inv = -np.mat(ctm[4:]) * ctm_inv
+ a, b, c, d = ctm_inv.reshape(4).tolist()
+ e, f = pos_inv.tolist()[0]
+ self.obj_patch[self.xobjmap[xobjid].objid] = (
+ f"q {ops_base}Q {a} {b} {c} {d} {e} {f} cm {ops_new}"
+ )
+ except Exception:
pass
elif subtype is LITERAL_IMAGE and "Width" in xobj and "Height" in xobj:
self.device.begin_figure(xobjid, (0, 0, 1, 1), MATRIX_IDENTITY)
@@ -1002,14 +1015,16 @@ def process_page(self, page: PDFPage) -> None:
else:
ctm = (1, 0, 0, 1, -x0, -y0)
self.device.begin_page(page, ctm)
- ops_base=self.render_contents(page.resources, page.contents, ctm=ctm)
- self.device.fontid=self.fontid
- self.device.fontmap=self.fontmap
- ops_new=self.device.end_page(page)
+ ops_base = self.render_contents(page.resources, page.contents, ctm=ctm)
+ self.device.fontid = self.fontid
+ self.device.fontmap = self.fontmap
+ ops_new = self.device.end_page(page)
# 上面渲染的时候会根据 cropbox 减掉页面偏移得到真实坐标,这里输出的时候需要用 cm 把页面偏移加回来
- self.obj_patch[page.page_xref]=f'q {ops_base}Q 1 0 0 1 {x0} {y0} cm {ops_new}' # ops_base 里可能有图,需要让 ops_new 里的文字覆盖在上面,使用 q/Q 重置位置矩阵
+ self.obj_patch[page.page_xref] = (
+ f"q {ops_base}Q 1 0 0 1 {x0} {y0} cm {ops_new}" # ops_base 里可能有图,需要让 ops_new 里的文字覆盖在上面,使用 q/Q 重置位置矩阵
+ )
for obj in page.contents:
- self.obj_patch[obj.objid]=''
+ self.obj_patch[obj.objid] = ""
def render_contents(
self,
@@ -1032,7 +1047,7 @@ def render_contents(
return self.execute(list_value(streams))
def execute(self, streams: Sequence[object]) -> None:
- ops=''
+ ops = ""
try:
parser = PDFContentParser(streams)
except PSEOF:
@@ -1057,17 +1072,38 @@ def execute(self, streams: Sequence[object]) -> None:
# log.debug("exec: %s %r", name, args)
if len(args) == nargs:
func(*args)
- if not (name[0]=='T' or name in ['"',"'",'EI','MP','DP','BMC','BDC']): # 过滤 T 系列文字指令,因为 EI 的参数是 obj 所以也需要过滤(只在少数文档中画横线时使用),过滤 marked 系列指令
- p=" ".join([f'{x:f}' if isinstance(x,float) else str(x).replace("'","") for x in args])
- ops+=f'{p} {name} '
+ if not (
+ name[0] == "T"
+ or name in ['"', "'", "EI", "MP", "DP", "BMC", "BDC"]
+ ): # 过滤 T 系列文字指令,因为 EI 的参数是 obj 所以也需要过滤(只在少数文档中画横线时使用),过滤 marked 系列指令
+ p = " ".join(
+ [
+ (
+ f"{x:f}"
+ if isinstance(x, float)
+ else str(x).replace("'", "")
+ )
+ for x in args
+ ]
+ )
+ ops += f"{p} {name} "
else:
# log.debug("exec: %s", name)
- targs=func()
- if targs==None:
- targs=[]
- if not (name[0]=='T' or name in ['BI','ID','EMC']):
- p=" ".join([f'{x:f}' if isinstance(x,float) else str(x).replace("'","") for x in targs])
- ops+=f'{p} {name} '
+ targs = func()
+ if targs is None:
+ targs = []
+ if not (name[0] == "T" or name in ["BI", "ID", "EMC"]):
+ p = " ".join(
+ [
+ (
+ f"{x:f}"
+ if isinstance(x, float)
+ else str(x).replace("'", "")
+ )
+ for x in targs
+ ]
+ )
+ ops += f"{p} {name} "
elif settings.STRICT:
error_msg = "Unknown operator: %r" % name
raise PDFInterpreterError(error_msg)
diff --git a/pdf2zh/pdfpage.py b/pdf2zh/pdfpage.py
index 2864f077..e6ac705d 100644
--- a/pdf2zh/pdfpage.py
+++ b/pdf2zh/pdfpage.py
@@ -188,7 +188,7 @@ def get_pages(
log.warning(warning_msg)
# Process each page contained in the document.
for pageno, page in enumerate(cls.create_pages(doc)):
- page.pageno=pageno
+ page.pageno = pageno
if pagenos and (pageno not in pagenos):
continue
yield page
diff --git a/pdf2zh/psparser.py b/pdf2zh/psparser.py
index 7472e540..1249153c 100644
--- a/pdf2zh/psparser.py
+++ b/pdf2zh/psparser.py
@@ -580,7 +580,7 @@ def nextobject(self) -> PSStackEntry[ExtraT]:
:return: keywords, literals, strings, numbers, arrays and dictionaries.
"""
- end=None
+ end = None
while not self.results:
(pos, token) = self.nexttoken()
if isinstance(token, (int, float, bool, str, bytes, PSLiteral)):
@@ -632,8 +632,8 @@ def nextobject(self) -> PSStackEntry[ExtraT]:
# token,
# self.curstack,
# )
- if token.name==b'endobj':
- end=pos+7
+ if token.name == b"endobj":
+ end = pos + 7
self.do_keyword(pos, token)
else:
log.error(
@@ -653,4 +653,4 @@ def nextobject(self) -> PSStackEntry[ExtraT]:
# log.debug("nextobject: %r", obj)
# except Exception:
# log.debug("nextobject: (unprintable object)")
- return end,obj
+ return end, obj
diff --git a/pdf2zh/translator.py b/pdf2zh/translator.py
index 173bade0..58348608 100644
--- a/pdf2zh/translator.py
+++ b/pdf2zh/translator.py
@@ -19,11 +19,7 @@ def __init__(self, service, lang_out, lang_in, model):
self.lang_in = lang_in
self.model = model
- def translate(self, text) -> str:
- ...
-
- def __str__(self):
- pass
+ def translate(self, text) -> str: ... # noqa: E704
def __str__(self):
return f"{self.service} {self.lang_out} {self.lang_in}"
@@ -37,7 +33,7 @@ def __init__(self, service, lang_out, lang_in, model):
self.session = requests.Session()
self.base_link = "http://translate.google.com/m"
self.headers = {
- "User-Agent": "Mozilla/4.0 (compatible;MSIE 6.0;Windows NT 5.1;SV1;.NET CLR 1.1.4322;.NET CLR 2.0.50727;.NET CLR 3.0.04506.30)"
+ "User-Agent": "Mozilla/4.0 (compatible;MSIE 6.0;Windows NT 5.1;SV1;.NET CLR 1.1.4322;.NET CLR 2.0.50727;.NET CLR 3.0.04506.30)" # noqa: E501
}
def translate(self, text):
@@ -51,7 +47,7 @@ def translate(self, text):
r'(?s)class="(?:t0|result-container)">(.*?)<', response.text
)
if response.status_code == 400:
- result = 'IRREPARABLE TRANSLATION ERROR'
+ result = "IRREPARABLE TRANSLATION ERROR"
elif len(re_result) == 0:
raise ValueError("Empty translation result")
else:
@@ -80,7 +76,7 @@ def __init__(self, service, lang_out, lang_in, model):
self.session = requests.Session()
self.base_link = f"{server_url}/{auth_key}/translate"
self.headers = {
- "User-Agent": "Mozilla/4.0 (compatible;MSIE 6.0;Windows NT 5.1;SV1;.NET CLR 1.1.4322;.NET CLR 2.0.50727;.NET CLR 3.0.04506.30)"
+ "User-Agent": "Mozilla/4.0 (compatible;MSIE 6.0;Windows NT 5.1;SV1;.NET CLR 1.1.4322;.NET CLR 2.0.50727;.NET CLR 3.0.04506.30)" # noqa: E501
}
def translate(self, text):
@@ -115,27 +111,25 @@ def translate(self, text):
class DeepLTranslator(BaseTranslator):
def __init__(self, service, lang_out, lang_in, model):
- lang_out='ZH' if lang_out=='auto' else lang_out
- lang_in='EN' if lang_in=='auto' else lang_in
+ lang_out = "ZH" if lang_out == "auto" else lang_out
+ lang_in = "EN" if lang_in == "auto" else lang_in
super().__init__(service, lang_out, lang_in, model)
self.session = requests.Session()
- auth_key = os.getenv('DEEPL_AUTH_KEY')
- server_url = os.getenv('DEEPL_SERVER_URL')
+ auth_key = os.getenv("DEEPL_AUTH_KEY")
+ server_url = os.getenv("DEEPL_SERVER_URL")
self.client = deepl.Translator(auth_key, server_url=server_url)
def translate(self, text):
response = self.client.translate_text(
- text,
- target_lang=self.lang_out,
- source_lang=self.lang_in
+ text, target_lang=self.lang_out, source_lang=self.lang_in
)
return response.text
class OllamaTranslator(BaseTranslator):
def __init__(self, service, lang_out, lang_in, model):
- lang_out='zh-CN' if lang_out=='auto' else lang_out
- lang_in='en' if lang_in=='auto' else lang_in
+ lang_out = "zh-CN" if lang_out == "auto" else lang_out
+ lang_in = "en" if lang_in == "auto" else lang_in
super().__init__(service, lang_out, lang_in, model)
self.options = {"temperature": 0} # 随机采样可能会打断公式标记
# OLLAMA_HOST
@@ -152,16 +146,17 @@ def translate(self, text):
},
{
"role": "user",
- "content": f"Translate the following markdown source text to {self.lang_out}. Keep the formula notation $v*$ unchanged. Output translation directly without any additional text.\nSource Text: {text}\nTranslated Text:",
+ "content": f"Translate the following markdown source text to {self.lang_out}. Keep the formula notation $v*$ unchanged. Output translation directly without any additional text.\nSource Text: {text}\nTranslated Text:", # noqa: E501
},
],
)
return response["message"]["content"].strip()
+
class OpenAITranslator(BaseTranslator):
def __init__(self, service, lang_out, lang_in, model):
- lang_out='zh-CN' if lang_out=='auto' else lang_out
- lang_in='en' if lang_in=='auto' else lang_in
+ lang_out = "zh-CN" if lang_out == "auto" else lang_out
+ lang_in = "en" if lang_in == "auto" else lang_in
super().__init__(service, lang_out, lang_in, model)
self.options = {"temperature": 0} # 随机采样可能会打断公式标记
# OPENAI_BASE_URL
@@ -179,7 +174,7 @@ def translate(self, text) -> str:
},
{
"role": "user",
- "content": f"Translate the following markdown source text to {self.lang_out}. Keep the formula notation $v*$ unchanged. Output translation directly without any additional text.\nSource Text: {text}\nTranslated Text:",
+ "content": f"Translate the following markdown source text to {self.lang_out}. Keep the formula notation $v*$ unchanged. Output translation directly without any additional text.\nSource Text: {text}\nTranslated Text:", # noqa: E501
},
],
)
@@ -188,8 +183,8 @@ def translate(self, text) -> str:
class AzureTranslator(BaseTranslator):
def __init__(self, service, lang_out, lang_in, model):
- lang_out='zh-Hans' if lang_out=='auto' else lang_out
- lang_in='en' if lang_in=='auto' else lang_in
+ lang_out = "zh-Hans" if lang_out == "auto" else lang_out
+ lang_in = "en" if lang_in == "auto" else lang_in
super().__init__(service, lang_out, lang_in, model)
try:
@@ -198,7 +193,9 @@ def __init__(self, service, lang_out, lang_in, model):
region = os.environ["AZURE_REGION"]
except KeyError as e:
missing_var = e.args[0]
- raise ValueError(f"The environment variable '{missing_var}' is required but not set.") from e
+ raise ValueError(
+ f"The environment variable '{missing_var}' is required but not set."
+ ) from e
credential = AzureKeyCredential(api_key)
self.client = TextTranslationClient(
diff --git a/pdf2zh/utils.py b/pdf2zh/utils.py
index f76d78e3..25697fdf 100644
--- a/pdf2zh/utils.py
+++ b/pdf2zh/utils.py
@@ -284,9 +284,11 @@ def apply_matrix_norm(m: Matrix, v: Point) -> Point:
(p, q) = v
return a * p + c * q, b * p + d * q
+
def matrix_scale(m: Matrix) -> float:
(a, b, c, d, e, f) = m
- return (a**2+c**2)**0.5
+ return (a**2 + c**2) ** 0.5
+
# Utility functions
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 00000000..2b3e5804
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,46 @@
+[project]
+name = "pdf2zh"
+version = "1.7.9"
+description = "Latex PDF Translator"
+authors = [{ name = "Byaidu", email = "byaidux@gmail.com" }]
+license = "AGPL-3.0"
+readme = "README.md"
+requires-python = ">=3.8,<3.13"
+classifiers = [
+ "Programming Language :: Python :: 3",
+ "Operating System :: OS Independent",
+]
+dependencies = [
+ "charset-normalizer",
+ "cryptography",
+ "requests",
+ "pymupdf",
+ "tqdm",
+ "tenacity",
+ "doclayout-yolo",
+ "numpy",
+ "ollama",
+ "deepl<1.19.1",
+ "openai",
+ "azure-ai-translation-text<=1.0.1",
+ "gradio",
+ "huggingface_hub",
+ "torch",
+]
+
+[project.optional-dependencies]
+dev = [
+ "black",
+ "flake8",
+ "pre-commit"
+]
+
+[project.urls]
+Homepage = "https://github.com/Byaidu/PDFMathTranslate"
+
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[project.scripts]
+pdf2zh = "pdf2zh.pdf2zh:main"
diff --git a/requirements.txt b/requirements.txt
deleted file mode 100644
index 0cc9bdcf..00000000
--- a/requirements.txt
+++ /dev/null
@@ -1,15 +0,0 @@
-charset-normalizer
-cryptography
-requests
-pymupdf
-tqdm
-tenacity
-doclayout-yolo
-numpy
-ollama
-deepl<1.19.1
-openai
-azure-ai-translation-text<=1.0.1
-gradio
-huggingface_hub
-torch
\ No newline at end of file
diff --git a/setup.cfg b/setup.cfg
new file mode 100644
index 00000000..053bd42e
--- /dev/null
+++ b/setup.cfg
@@ -0,0 +1,4 @@
+[flake8]
+max-line-length = 120
+ignore = E203,W503,E261
+exclude = .git,build,dist,docs
\ No newline at end of file
diff --git a/setup.py b/setup.py
deleted file mode 100644
index 57225f72..00000000
--- a/setup.py
+++ /dev/null
@@ -1,31 +0,0 @@
-from pdf2zh import __version__, __author__
-from setuptools import setup
-
-with open("README.md", encoding='utf-8') as f:
- readme = f.read()
-
-with open("requirements.txt", encoding='utf-8') as f:
- requirements = f.readlines()
-
-setup(
- name="pdf2zh",
- long_description=readme,
- long_description_content_type="text/markdown",
- description="Latex PDF Translator",
- license="AGPLv3",
- version=__version__,
- author=__author__,
- author_email="byaidux@gmail.com",
- url="https://github.com/Byaidu/PDFMathTranslate",
- packages=["pdf2zh"],
- install_requires=requirements,
- classifiers=[
- "Programming Language :: Python :: 3",
- "Operating System :: OS Independent",
- ],
- entry_points={
- 'console_scripts': [
- 'pdf2zh=pdf2zh.pdf2zh:main',
- ]
- },
-)