Added optional ability to apply XSLT transformation to FB2 - after some

HTML entities are encoded but before anything else. Changed how strings are split into words during hyphenation - better whitespace handling. Added unescaping of text strings to allow proper processing of the XSLT transformation results.
rupor-github · Feb 17, 2015 · 4796bcd · 4796bcd
1 parent f85f31c
commit 4796bcd
Show file tree

Hide file tree

Showing 8 changed files with 71 additions and 32 deletions.
diff --git a/.gitignore b/.gitignore
@@ -1,5 +1,6 @@
 # vim
 *.swp
+*_clip.txt
 
 # releases
 *.7z

diff --git a/fb2mobi.py b/fb2mobi.py
@@ -124,6 +124,10 @@ def process_file(config, infile, outfile=None):
     if not config.current_profile['css']:
         config.log.warning(u'Profile does not have link to css file.')
 
+    if 'xslt' in config.current_profile and not os.path.exists(config.current_profile['xslt']):
+        config.log.critical(u'Transformation file {0} not found'.format(config.current_profile['xslt']))
+        return
+
     if config.kindle_compression_level < 0 or config.kindle_compression_level > 2:
         config.log.warning(u'Parameter kindleCompressionLevel should be between 0 and 2, using default value (1).')
         config.kindle_compression_level = 1
@@ -402,6 +406,8 @@ def process(args):
             config.kindle_compression_level = args.kindlecompressionlevel
         if args.css:
             config.current_profile['css'] = args.css
+        if args.xslt:
+            config.current_profile['xslt'] = args.xslt
         if args.dropcaps is not None:
             config.current_profile['dropcaps'] = args.dropcaps
         if args.tocmaxlevel:
@@ -488,6 +494,7 @@ def process(args):
 
     argparser.add_argument('-p', '--profile', type=str, default=None, help=u'Profile name from configuration')
     argparser.add_argument('--css', type=str, default=None, help=u'css file name')
+    argparser.add_argument('--xslt', type=str, default=None, help=u'xslt file name')
     dropcaps_group = argparser.add_mutually_exclusive_group()
     dropcaps_group.add_argument('--dropcaps', dest='dropcaps', action='store_true', default=None, help=u'Use dropcaps')
     dropcaps_group.add_argument('--no-dropcaps', dest='dropcaps', action='store_false', default=None, help=u'Do not use dropcaps')

diff --git a/modules/config.py b/modules/config.py
@@ -164,6 +164,9 @@ def _load(self):
                             else:
                                 self.profiles[prof_name]['parse_css'] = True
 
+                        elif p.tag == 'xslt':
+                            self.profiles[prof_name]['xslt'] = os.path.abspath(os.path.join(os.path.abspath(os.path.dirname(self.config_file)), p.text))
+
                         elif p.tag == 'chapterOnNewPage':
                             self.profiles[prof_name]['chapterOnNewPage'] = p.text.lower() == 'true'
 

diff --git a/modules/fb2html.py b/modules/fb2html.py
@@ -3,7 +3,6 @@
 
 import os
 from lxml import etree, html
-import cgi
 import re
 import shutil
 import io
@@ -12,6 +11,7 @@
 import cssutils
 import base64
 import hashlib
+import html
 
 from hyphenator import Hyphenator
 
@@ -39,7 +39,6 @@
     ('&nbsp;',  '&#160;'),
     ('&ensp;',  '&#8194;'),
     ('&emsp;',  '&#8195;'),
-    ('&acirc;', '&#226;')
     ]
 
 def transliterate(string):
@@ -135,7 +134,7 @@ def save_html(string):
     '''
 
     if string:
-        return cgi.escape(string)
+        return html.escape(string)
     else:
         return ''
 
@@ -282,12 +281,20 @@ def __init__(self, fb2file, mobifile, tempdir, config):
         with codecs.open(fb2file, 'r', 'utf-8') as fin:
             fb2_str = fin.read()
 
-        # We need to take care of some HTML entities which XML parser will destroy
+        # rupor - No matter what I do &nbsp &ensp and &emsp are being eaten by XML parser
+        #         and there are probably more of those...
         for before, after in HTMLENTITIES:
             fb2_str = fb2_str.replace(before, after)
 
-        #self.tree = etree.parse(fb2file, parser=etree.XMLParser(recover=True))
         self.tree = etree.parse(io.BytesIO(bytes(fb2_str,'utf-8')), parser=etree.XMLParser(recover=True))
+
+        if 'xslt' in config.current_profile:
+            config.log.info(u'Applying XSLT transformations "{0}"'.format(config.current_profile['xslt']))
+            self.transform = etree.XSLT(etree.parse(config.current_profile['xslt']))
+            self.tree = self.transform(self.tree)
+            for entry in self.transform.error_log:
+                self.log.warning(entry)
+
         self.root = self.tree.getroot()
 
         self.hyphenator = Hyphenator('ru')
@@ -773,19 +780,11 @@ def parse_format(self, elem, tag = None, css = None, href=None):
                 self.inline_image_mode = True
 
         if elem.text:
-            if self.hyphenator and self.hyphenate and not (self.header or self.subheader):
-                hstring = ' '.join([self.hyphenator.hyphenate_word(w, SOFT_HYPHEN) for w in elem.text.split()])
-                if elem.text[0].isspace():
-                    hstring = ' ' + hstring
-                if elem.text[-1].isspace():
-                    hstring += ' '
-            else:
-                hstring = elem.text
-
+            hs = self.insert_hyphenation(elem.text)
             if dodropcaps:
-                self.buff.append('<span class="dropcaps">%s</span>%s' % (hstring[0], save_html(hstring[1:])))
+                self.buff.append('<span class="dropcaps">%s</span>%s' % (hs[0], save_html(hs[1:])))
             else:
-                self.buff.append(save_html(hstring))
+                self.buff.append(save_html(hs))
 
         for e in elem:
             if e.tag == etree.Comment:
@@ -865,7 +864,6 @@ def parse_format(self, elem, tag = None, css = None, href=None):
         if elem.tail:
             self.buff.append(save_html(self.insert_hyphenation(elem.tail)))
 
-
     def parse_table_element(self, elem):
         self.buff.append('<{0}'.format(ns_tag(elem.tag)))
 
@@ -876,22 +874,16 @@ def parse_table_element(self, elem):
         self.parse_format(elem)
         self.buff.append('</{0}>'.format(ns_tag(elem.tag)))
 
+    def insert_hyphenation(self, s):
+        hs = ''
 
-    def insert_hyphenation(self, string):
-        hstring = ''
-
-        if string:
+        if s:
             if self.hyphenator and self.hyphenate and not (self.header or self.subheader):
-                hstring = ' '.join([self.hyphenator.hyphenate_word(w, SOFT_HYPHEN) for w in string.split()])
-                if string[0].isspace():
-                    hstring = ' ' + hstring
-                if string[-1].isspace():
-                    hstring += ' '
+                hs = ' '.join([self.hyphenator.hyphenate_word(html.unescape(w), SOFT_HYPHEN) for w in s.split(' ')])
             else:
-                hstring = string
-
-        return hstring
+                hs = html.unescape(s)
 
+        return hs
 
     def parse_body(self, elem):
         self.body_name = elem.attrib['name'] if 'name' in elem.attrib else ''

diff --git a/modules/hyphenator.py b/modules/hyphenator.py
@@ -30,7 +30,7 @@ def __init__(self, lang):
     def _init_patterns(self, patterns, exceptions):
         for pattern in patterns.split():
             self._insert_pattern(pattern)
-        
+
         for ex in exceptions.split():
             # Convert the hyphenated pattern into a point array for use later.
             self.exceptions[ex.replace('-', '')] = [0] + [ int(h == '-') for h in re.split(r'[\w]', ex, flags=re.U) ]
@@ -79,6 +79,9 @@ def _hyphenate_word(self, word):
         # Short words aren't hyphenated.
         if len(word) <= 3:
             return [word]
+        # rupor - just in case HTML entities aren't hyphenated
+        if word.startswith('&') and word.endswith(';'):
+            return [word]
         # If the word is an exception, get the stored points.
         if word.lower() in self.exceptions:
             points = self.exceptions[word.lower()]

diff --git a/release.cmd b/release.cmd
@@ -2,7 +2,7 @@ IF .%1. == .. GOTO ERR
 
 del fb2mobi_%1.7z >nul
 copy dist\fb2mobi.exe .
-7z.exe a fb2mobi_%1.7z fb2mobi.exe fb2mobi.config profiles
+7z.exe a fb2mobi_%1.7z fb2mobi.exe fb2mobi.config spaces_emsp.xsl profiles
 del fb2mobi.exe
 
 goto FIN

diff --git a/spaces_emsp.xsl b/spaces_emsp.xsl
@@ -0,0 +1,33 @@
+<?xml version="1.0" encoding="utf-8"?>
+<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform" xmlns:fb="http://www.gribuser.ru/xml/fictionbook/2.0">
+	<xsl:output method="xml" encoding="UTF-8" indent="no"/>
+
+	<xsl:template match="node()|@*">
+		<xsl:copy>
+			<xsl:apply-templates select="node()|@*"/>
+		</xsl:copy>
+	</xsl:template>
+
+	<xsl:template match="fb:p">
+		<xsl:choose>
+			<xsl:when test="starts-with(.,'– ')">
+				<xsl:element name="p" namespace="http://www.gribuser.ru/xml/fictionbook/2.0">
+					<xsl:text disable-output-escaping="yes">–&amp;emsp;</xsl:text>
+					<xsl:value-of select="substring(.,3)"/>
+				</xsl:element>
+			</xsl:when>
+			<xsl:when test="starts-with(.,'–')">
+				<xsl:element name="p" namespace="http://www.gribuser.ru/xml/fictionbook/2.0">
+					<xsl:text disable-output-escaping="yes">–&amp;emsp;</xsl:text>
+					<xsl:value-of select="substring(.,2)"/>
+				</xsl:element>
+			</xsl:when>
+			<xsl:otherwise>
+				<xsl:copy>
+					<xsl:apply-templates/>
+				</xsl:copy>
+			</xsl:otherwise>
+		</xsl:choose>
+	</xsl:template>
+
+</xsl:stylesheet>
diff --git a/version.py b/version.py
@@ -1,3 +1,3 @@
 # -*- coding: utf-8 -*- 
 
-VERSION = u'2.0.1'
+VERSION = u'2.1.0'