Skip to content
This repository has been archived by the owner on Mar 15, 2020. It is now read-only.

Commit

Permalink
Added an optional ability to apply an XSLT transformation to fb2 - after some
Browse files Browse the repository at this point in the history
html entities are encoded but before anything else.

Changed how strings are split into words during hyphenation -
better whitespace handling.

Added unescaping of text strings to allow proper processing of the XSLT
transformation results.
  • Loading branch information
rupor-github committed Feb 17, 2015
1 parent f85f31c commit 4796bcd
Show file tree
Hide file tree
Showing 8 changed files with 71 additions and 32 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# vim
*.swp
*_clip.txt

# releases
*.7z
Expand Down
7 changes: 7 additions & 0 deletions fb2mobi.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,10 @@ def process_file(config, infile, outfile=None):
if not config.current_profile['css']:
config.log.warning(u'Profile does not have link to css file.')

if 'xslt' in config.current_profile and not os.path.exists(config.current_profile['xslt']):
config.log.critical(u'Transformation file {0} not found'.format(config.current_profile['xslt']))
return

if config.kindle_compression_level < 0 or config.kindle_compression_level > 2:
config.log.warning(u'Parameter kindleCompressionLevel should be between 0 and 2, using default value (1).')
config.kindle_compression_level = 1
Expand Down Expand Up @@ -402,6 +406,8 @@ def process(args):
config.kindle_compression_level = args.kindlecompressionlevel
if args.css:
config.current_profile['css'] = args.css
if args.xslt:
config.current_profile['xslt'] = args.xslt
if args.dropcaps is not None:
config.current_profile['dropcaps'] = args.dropcaps
if args.tocmaxlevel:
Expand Down Expand Up @@ -488,6 +494,7 @@ def process(args):

argparser.add_argument('-p', '--profile', type=str, default=None, help=u'Profile name from configuration')
argparser.add_argument('--css', type=str, default=None, help=u'css file name')
argparser.add_argument('--xslt', type=str, default=None, help=u'xslt file name')
dropcaps_group = argparser.add_mutually_exclusive_group()
dropcaps_group.add_argument('--dropcaps', dest='dropcaps', action='store_true', default=None, help=u'Use dropcaps')
dropcaps_group.add_argument('--no-dropcaps', dest='dropcaps', action='store_false', default=None, help=u'Do not use dropcaps')
Expand Down
3 changes: 3 additions & 0 deletions modules/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,9 @@ def _load(self):
else:
self.profiles[prof_name]['parse_css'] = True

elif p.tag == 'xslt':
self.profiles[prof_name]['xslt'] = os.path.abspath(os.path.join(os.path.abspath(os.path.dirname(self.config_file)), p.text))

elif p.tag == 'chapterOnNewPage':
self.profiles[prof_name]['chapterOnNewPage'] = p.text.lower() == 'true'

Expand Down
50 changes: 21 additions & 29 deletions modules/fb2html.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@

import os
from lxml import etree, html
import cgi
import re
import shutil
import io
Expand All @@ -12,6 +11,7 @@
import cssutils
import base64
import hashlib
import html

from hyphenator import Hyphenator

Expand Down Expand Up @@ -39,7 +39,6 @@
('&nbsp;', '&#160;'),
('&ensp;', '&#8194;'),
('&emsp;', '&#8195;'),
('&acirc;', '&#226;')
]

def transliterate(string):
Expand Down Expand Up @@ -135,7 +134,7 @@ def save_html(string):
'''

if string:
return cgi.escape(string)
return html.escape(string)
else:
return ''

Expand Down Expand Up @@ -282,12 +281,20 @@ def __init__(self, fb2file, mobifile, tempdir, config):
with codecs.open(fb2file, 'r', 'utf-8') as fin:
fb2_str = fin.read()

# We need to take care of some HTML entities which XML parser will destroy
# rupor - No matter what I do &nbsp &ensp and &emsp are being eaten by XML parser
# and there are probably more of those...
for before, after in HTMLENTITIES:
fb2_str = fb2_str.replace(before, after)

#self.tree = etree.parse(fb2file, parser=etree.XMLParser(recover=True))
self.tree = etree.parse(io.BytesIO(bytes(fb2_str,'utf-8')), parser=etree.XMLParser(recover=True))

if 'xslt' in config.current_profile:
config.log.info(u'Applying XSLT transformations "{0}"'.format(config.current_profile['xslt']))
self.transform = etree.XSLT(etree.parse(config.current_profile['xslt']))
self.tree = self.transform(self.tree)
for entry in self.transform.error_log:
self.log.warning(entry)

self.root = self.tree.getroot()

self.hyphenator = Hyphenator('ru')
Expand Down Expand Up @@ -773,19 +780,11 @@ def parse_format(self, elem, tag = None, css = None, href=None):
self.inline_image_mode = True

if elem.text:
if self.hyphenator and self.hyphenate and not (self.header or self.subheader):
hstring = ' '.join([self.hyphenator.hyphenate_word(w, SOFT_HYPHEN) for w in elem.text.split()])
if elem.text[0].isspace():
hstring = ' ' + hstring
if elem.text[-1].isspace():
hstring += ' '
else:
hstring = elem.text

hs = self.insert_hyphenation(elem.text)
if dodropcaps:
self.buff.append('<span class="dropcaps">%s</span>%s' % (hstring[0], save_html(hstring[1:])))
self.buff.append('<span class="dropcaps">%s</span>%s' % (hs[0], save_html(hs[1:])))
else:
self.buff.append(save_html(hstring))
self.buff.append(save_html(hs))

for e in elem:
if e.tag == etree.Comment:
Expand Down Expand Up @@ -865,7 +864,6 @@ def parse_format(self, elem, tag = None, css = None, href=None):
if elem.tail:
self.buff.append(save_html(self.insert_hyphenation(elem.tail)))


def parse_table_element(self, elem):
self.buff.append('<{0}'.format(ns_tag(elem.tag)))

Expand All @@ -876,22 +874,16 @@ def parse_table_element(self, elem):
self.parse_format(elem)
self.buff.append('</{0}>'.format(ns_tag(elem.tag)))

def insert_hyphenation(self, s):
hs = ''

def insert_hyphenation(self, string):
hstring = ''

if string:
if s:
if self.hyphenator and self.hyphenate and not (self.header or self.subheader):
hstring = ' '.join([self.hyphenator.hyphenate_word(w, SOFT_HYPHEN) for w in string.split()])
if string[0].isspace():
hstring = ' ' + hstring
if string[-1].isspace():
hstring += ' '
hs = ' '.join([self.hyphenator.hyphenate_word(html.unescape(w), SOFT_HYPHEN) for w in s.split(' ')])
else:
hstring = string

return hstring
hs = html.unescape(s)

return hs

def parse_body(self, elem):
self.body_name = elem.attrib['name'] if 'name' in elem.attrib else ''
Expand Down
5 changes: 4 additions & 1 deletion modules/hyphenator.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ def __init__(self, lang):
def _init_patterns(self, patterns, exceptions):
for pattern in patterns.split():
self._insert_pattern(pattern)

for ex in exceptions.split():
# Convert the hyphenated pattern into a point array for use later.
self.exceptions[ex.replace('-', '')] = [0] + [ int(h == '-') for h in re.split(r'[\w]', ex, flags=re.U) ]
Expand Down Expand Up @@ -79,6 +79,9 @@ def _hyphenate_word(self, word):
# Short words aren't hyphenated.
if len(word) <= 3:
return [word]
# rupor - just in case HTML entities aren't hyphenated
if word.startswith('&') and word.endswith(';'):
return [word]
# If the word is an exception, get the stored points.
if word.lower() in self.exceptions:
points = self.exceptions[word.lower()]
Expand Down
2 changes: 1 addition & 1 deletion release.cmd
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ IF .%1. == .. GOTO ERR

del fb2mobi_%1.7z >nul
copy dist\fb2mobi.exe .
7z.exe a fb2mobi_%1.7z fb2mobi.exe fb2mobi.config profiles
7z.exe a fb2mobi_%1.7z fb2mobi.exe fb2mobi.config spaces_emsp.xsl profiles
del fb2mobi.exe

goto FIN
Expand Down
33 changes: 33 additions & 0 deletions spaces_emsp.xsl
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
<?xml version="1.0" encoding="utf-8"?>
<!--
Stylesheet for FictionBook 2.0 documents (namespace bound to the "fb" prefix).
Paragraphs whose text starts with a dash are rewritten so that the dash is
followed by the text "&emsp;"; all other content is copied through.
-->
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns:fb="http://www.gribuser.ru/xml/fictionbook/2.0">
<xsl:output method="xml" encoding="UTF-8" indent="no"/>

<!-- Identity template: copies every node and attribute that no more specific template matches. -->
<xsl:template match="node()|@*">
<xsl:copy>
<xsl:apply-templates select="node()|@*"/>
</xsl:copy>
</xsl:template>

<!-- Paragraph template: chooses a branch based on how the paragraph's string value begins. -->
<xsl:template match="fb:p">
<xsl:choose>
<!-- Case 1: the dash is followed by one more character in the test string; both are skipped (substring from position 3). -->
<xsl:when test="starts-with(.,'– ')">
<xsl:element name="p" namespace="http://www.gribuser.ru/xml/fictionbook/2.0">
<!-- disable-output-escaping asks the processor to emit the entity reference text as written instead of escaping the ampersand. -->
<xsl:text disable-output-escaping="yes">–&amp;emsp;</xsl:text>
<!-- NOTE(review): value-of emits only the string value, so child elements and the attributes of the original paragraph are not carried over; confirm this is intended. -->
<xsl:value-of select="substring(.,3)"/>
</xsl:element>
</xsl:when>
<!-- Case 2: the dash is directly followed by the text; only the dash is skipped (substring from position 2). -->
<xsl:when test="starts-with(.,'–')">
<xsl:element name="p" namespace="http://www.gribuser.ru/xml/fictionbook/2.0">
<xsl:text disable-output-escaping="yes">–&amp;emsp;</xsl:text>
<xsl:value-of select="substring(.,2)"/>
</xsl:element>
</xsl:when>
<!-- Any other paragraph: copy the element and process its child nodes. -->
<!-- NOTE(review): apply-templates without a select visits child nodes only, so attributes of the paragraph are not copied in this branch; confirm this is intended. -->
<xsl:otherwise>
<xsl:copy>
<xsl:apply-templates/>
</xsl:copy>
</xsl:otherwise>
</xsl:choose>
</xsl:template>

</xsl:stylesheet>
2 changes: 1 addition & 1 deletion version.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
# -*- coding: utf-8 -*-

VERSION = u'2.0.1'
VERSION = u'2.1.0'

0 comments on commit 4796bcd

Please sign in to comment.