Skip to content

Commit

Permalink
Merge pull request #230 from NSoiffer/space-as-attr
Browse files Browse the repository at this point in the history
Change whitespace handling to use attributes
  • Loading branch information
NSoiffer authored Dec 10, 2023
2 parents 763d90e + d5c076d commit b52370f
Show file tree
Hide file tree
Showing 23 changed files with 1,493 additions and 884 deletions.
145 changes: 62 additions & 83 deletions Rules/Braille/CMU/CMU_Rules.yaml
Original file line number Diff line number Diff line change
@@ -1,6 +1,38 @@
# References such as 7.2(5) refer to the fifth example in section 7.2 of the guide from ONCE:
# https://www.once.es/servicios-sociales/braille/comision-braille-espanola/documentos-tecnicos/documentos-tecnicos-relacionados-con-el-braille/documentos/b5-signografia-matematica.pdf
---
-
  # Wraps a node that has recorded whitespace (>= 0.25 on either side) so that
  # a "𝐖" marker is emitted on the side(s) where the node is a unit
  # (class 'MathML-unit', or an intent containing ':unit', on the node or its BaseNode).
  # $MatchingWhitespace is set to true while the node is re-processed via x: "."
  # so that this rule's own match (which requires it to be false) does not fire again.
  name: unit-spaces
  tag: "!*"
  match: "not(self::m:math) and not($MatchingWhitespace) and (@data-previous-space-width >= 0.25 or @data-following-space-width >= 0.25)"
  replace:
    - with:
        variables:
          - MatchingWhitespace: "true()"
        replace:
          # marker before the unit
          - test:
              - if: "(@class='MathML-unit' or contains(@intent, ':unit') or BaseNode(.)[@class='MathML-unit' or contains(@intent, ':unit')]) and @data-previous-space-width >= 0.25"
                then:
                  - t: "𝐖"
          # the node itself
          - x: "."
          # marker after the unit
          - test:
              - if: "(@class='MathML-unit' or contains(@intent, ':unit') or BaseNode(.)[@class='MathML-unit' or contains(@intent, ':unit')]) and @data-following-space-width >= 0.25"
                then:
                  - t: "𝐖"

-
  # Any element whose intent contains ':blank' is brailled as an omission.
  name: omissions
  tag: "!*"
  match: "contains(@intent, ':blank')"
  replace:
    - test:
        # an omitted operator -- 14.5(1)
        - if: "self::m:mo"
          then:
            - t: "⠰"
        # content containing a non-breaking space is treated as an omitted number
        # FIX: is there anything else to look at that would raise the odds of it being digits?
        - else_if: "contains(., '\u00A0')"
          then:
            - t: "N"
            # each '_' becomes "⠰"; the non-breaking space has no replacement, so it is dropped
            - x: "translate(., '_\u00A0', '⠰')"
          else:
            - t: "⠰⠤⠆"
    - t: ""


-
name: default
tag: msqrt
Expand Down Expand Up @@ -201,6 +233,7 @@
- RowStart: "''" # empty string -- it needs to be set
- RowEnd: "''" # empty string -- it needs to be set
- NewScriptContext: "''" # empty string -- it needs to be set
- MatchingWhitespace: "false()"
replace: [x: "*"]

-
Expand Down Expand Up @@ -260,40 +293,6 @@
match: "."
replace: [x: "*"]

-
  # A non-breaking space is added at the start/end of a number when text adjoins it, to help the parse.
  # Inside a number that character brailles differently, and unicode.yaml does not seem to have a way
  # to ask "which character position inside the mn am I at?", so the spaces are peeled off here.
  # This is the messy, abnormal case handled on its own -- CMU seems to want the spaces left out.
  #
  # The BrailleChars start/end arguments skip a leading and/or trailing non-breaking space.
  # When a trailing one is present, a "𝐖" (a unit follows) or "W" marker is emitted after the digits.
  name: peel-off-spaces
  tag:
    - mn
    - mi
  match: "starts-with(., '\u00A0') or substring(., string-length(.), 1)='\u00A0'"
  replace:
    - test:
        if: "starts-with(., '\u00A0')"
        then:
          # leading non-breaking space present
          - test:
              if: "substring(., string-length(.), 1)='\u00A0'"
              then:
                # ...and a trailing one too
                - x: "BrailleChars(., 'CMU', 2, string-length(.))"
                - test:
                    # '*[2]' skips over the invisible times
                    if: "following-sibling::*[2][@class='MathML-unit' or BaseNode(.)[@class='MathML-unit']]"
                    then:
                      - t: "𝐖"      # BANA 5(a) -- units are treated as separate expressions
                    else:
                      - t: "W"
              else:
                - x: "BrailleChars(., 'CMU', 2, string-length(.)+1)"
        else:
          # no leading non-breaking space
          - test:
              if: "substring(., string-length(.), 1)='\u00A0'"
              then:
                - x: "BrailleChars(., 'CMU', 1, string-length(.))"
                - test:
                    # '*[2]' skips over the invisible times
                    if: "following-sibling::*[2][@class='MathML-unit' or BaseNode(.)[@class='MathML-unit']]"
                    then:
                      - t: "𝐖"      # BANA 5(a) -- units are treated as separate expressions
                    else:
                      - t: "W"
              else:
                - x: "BrailleChars(., 'CMU', 1, string-length(.)+1)"

-
name: roman_numeral
tag: mn
Expand All @@ -303,29 +302,6 @@
- x: "BrailleChars(., 'CMU', 1, 2)"
- x: "BrailleChars(translate(., 'IVXLCDM', 'ivxlcdm'), 'CMU', 2, string-length(.)+1)"

-
  # Token elements whose intent contains ':omission' are brailled as an omission.
  name: omissions
  tag:
    - mi
    - mtext
    - mo
  match: "contains(@intent, ':omission')"
  replace:
    - test:
        # an omitted operator -- 14.5(1)
        - if: "self::m:mo"
          then:
            - t: "⠰"
        # content containing a non-breaking space is treated as an omitted number
        # FIX: is there anything else to look at that would raise the odds of it being digits?
        - else_if: "contains(., '\u00A0')"
          then:
            - t: "N"
            # each '_' becomes "⠰"; the non-breaking space has no replacement, so it is dropped
            - x: "translate(., '_\u00A0', '⠰')"
          else:
            - t: "⠰⠤⠆"
    - t: ""

-
  # An mi/mtext whose intent contains ':unit': emit "W", then the element's text.
  name: units
  tag:
    - mi
    - mtext
  match: "contains(@intent, ':unit')"
  replace:
    - t: "W"
    - x: "text()"

-
name: Ln-hack
tag: mi
Expand Down Expand Up @@ -361,7 +337,8 @@
replace:
- x: "BrailleChars(., 'CMU')" # also deals with "dot after 'arc'"
- test: # Section 12 says strings longer than one should end with dot 3 rather than restricting to just listed functions (don't do this for mtext)
if: "string-length(.) > 1 and translate(., 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ', '') = ''"
if: "string-length(.) > 1 and translate(., 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ', '') = '' and
not(@class='MathML-unit' or contains(@intent, ':unit') )"
then: [t: "."]


Expand Down Expand Up @@ -1133,38 +1110,40 @@
name: default
tag: menclose
match: "."
variables:
- IsCancellation: "contains(@notation,'updiagonalstrike') or contains(@notation,'downdiagonalstrike') or
contains(@notation,'verticalstrike') or contains(@notation,'horizontalstrike')"
replace:
- test:
if: "contains(@notation,'top')"
then: [t: "⠈⠉"] # overline
- test:
if: "contains(@notation,'bottom')"
then: [t: "⠠⠤"] # underline
- test:
if: "$IsCancellation"
then: [t: "⠻"] # cancellation
- test: # group anything that isn't a single character or bracketed
if: "*[1][(IsNode(., 'leaf') and (string-length(.) = 1 or IsInDefinition(., 'CMUFunctionNames'))) or IsBracketed(., '', '')]"
then: [x: "*[1]"]
if: "contains(@notation,'box')"
then_test:
if: "*[1][self::m:mtext and .='\u00A0']" # box and roundedbox
then: [t: "⠰⠤⠆"] # omission
else: [t: "1⠫⠼⠙"] # square (no rectangle in UEB)
else:
- t: ""
- x: "*[1]"
- t: ""

- with:
variables:
- IsCancellation: "contains(@notation,'updiagonalstrike') or contains(@notation,'downdiagonalstrike') or
contains(@notation,'verticalstrike') or contains(@notation,'horizontalstrike')"
replace:
- test:
if: "contains(@notation,'top')"
then: [t: "⠈⠉"] # overline
- test:
if: "contains(@notation,'bottom')"
then: [t: "⠠⠤"] # underline
- test:
if: "$IsCancellation"
then: [t: "⠻"] # cancellation
- test:
if: "*[1][(IsNode(., 'leaf') and (string-length(.) = 1 or IsInDefinition(., 'CMUFunctionNames'))) or IsBracketed(., '', '')]"
then: [x: "*[1]"]
else:
- t: ""
- x: "*[1]"
- t: ""

# - test:
# if: "contains(concat(' ', normalize-space(@notation), ' '), ' left ')" #avoid 'leftarrow'
# then: [t: "⠸"]
# - test:
# if: "contains(@notation,'box')" # box and roundedbox
# then:
# # - test:
# # if: "$AddSpaces"
# # then: [t: " "]
# - t: "1⠫⠼⠙" # square (no rectangle in UEB)
# - test:
# if: "contains(@notation,'circle')"
# then:
# # - test:
Expand Down
33 changes: 11 additions & 22 deletions Rules/Braille/CMU/unicode.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -488,15 +488,12 @@
# - "⇣": [t: "1⠳⠂⠩"] # 0x21E3 (Downwards dashed arrow)
# - "⥂": [t: "⠸⠶"] # 0x2942 (Rightwards arrow above short leftwards arrow (equilibrium, trend to the right))
# - "⥄": [t: "⠈⠸⠶"] # 0x2944 (Short rightwards arrow above leftwards arrow (equilibrium, trend to the left))
# - "△": [t: "1⠫⠼⠉t"] # 0x25B3 (Triangle)
# - "□": [t: "1⠫⠼⠙t"] # 0x25A1 (Square)
# - "▤": [t: "⠨⠫⠼⠙t"] # 0x25A4 (Square with horizontal fill)
# - "▥": [t: "⠨⠫⠼⠙t"] # 0x25A5 (Square with vertical fill)
# - "▦": [t: "⠨⠫⠼⠙t"] # 0x25A6 (Square with orthogonal crosshatch fill)
# - "▧": [t: "⠨⠫⠼⠙t"] # 0x25A7 (Square with upper left to lower right fill)
# - "▨": [t: "⠨⠫⠼⠙t"] # 0x25A8 (Square with upper right to lower left fill)
# - "▩": [t: "⠨⠫⠼⠙t"] # 0x25A9 (Square with diagonal crosshatch fill)
# - "○": [t: "1⠫⠿t"] # 0x25CB (Circle)
# - "◍": [t: "⠨⠫⠿t"] # 0x25CD (Circle with vertical fill)
# - "▱": [t: "1⠫⠈⠼⠙t"] # 0x25B1 (Parallelogram)
# - "▲": [t: "⠸⠫⠼⠉t"] # 0x25B2 (Filled triangle)
Expand Down Expand Up @@ -538,15 +535,17 @@

- " ": # 0x20 (Space)
- test:
if: "self::m:mn"
then: [t: "N⠄"]
else: [t: "W"]
- if: "self::m:mn"
then: [t: "N⠄"]
- else_if: "@data-added='missing-content' or @data-empty-in-2D or @width > 1.1"
then: [t: "⠰⠤⠆"] # omission
- else_if: "@width < 0.25"
then: [t: ""] # tweaking space -- ignore
else: [t: "W"] # space in text or wide enough space
- " ": # 0xa0 (Non-breaking Space)
- test:
- if: "self::m:mn"
then: [t: "N⠄"]
- else_if: "following-sibling::*[1][@class='MathML-unit' or BaseNode(.)[@class='MathML-unit']]"
then: [t: "𝐖"]
else: [t: "W"]
- ",": # 0x2c (Comma)
- test:
Expand All @@ -562,28 +561,17 @@
# 2. if there is a '.' or other likely block separator after the first '.', then not decimal separator
# there are likely other (more complicated?) tests that could be used
- with:
variables: [StringAfter: "substring-after(DEBUG(.), '.')"]
variables: [StringAfter: "substring-after(., '.')"]
replace:
- test:
if: "string-length(DEBUG($StringAfter)) >= 3 and DEBUG(string-length($StringAfter))=string-length( DEBUG(translate($StringAfter, $BlockSeparators, '')) )"
if: "string-length($StringAfter) >= 3 and string-length($StringAfter)=string-length( translate($StringAfter, $BlockSeparators, '') )"
then: [t: "N⠄"] # really is a block separator
else: [t: "N⠂"] # treat as decimal separator
else: [t: "W⠄"]
- ":": [t: "⠐⠂"] # 0x003A (Colon)
- "-": [t: "⠤"] # 0x2d (Minus sign or hyphen)
# The key is written as an explicit escape so the (invisible) character is unambiguous;
# as displayed, the key was indistinguishable from the empty string.
- "\u2061": [t: ""]             # 0x2061 (invisible function apply) -- produces no braille

- "": # 0x2062 (invisible times)
- test:
if: # GTM 9.3.3 (not very clear in rule, but the function name has to start with a lower case latin char ['no indicators'])
- "parent::m:mrow and "
- "preceding-sibling::*[1]["
- " (self::m:mi and translate(., 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ', '') = '') or "
- " (self::m:mrow and translate(*[last()], 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ', '') = '')] and"
- " following::*[1][self::m:mrow and count(*)=3 and " # look for function apply
- " *[2][text()='⁡'] and *[1][self::m:mi and translate(., 'abcdefghijklmnopqrstuvwxyz', '') = '']]"
then: [t: "W"]
else: [t: ""]
# Invisible operators produce no braille. The keys are written as explicit escapes so that each
# entry is distinguishable; as displayed, all three keys were the same empty string.
- "\u2062": [t: ""]             # 0x2062 (invisible times)
- "\u2063": [t: ""]             # 0x2063 (invisible separator)
- "\u2064": [t: ""]             # 0x2064 (invisible plus)

Expand All @@ -602,6 +590,7 @@
# Each key is the character whose code point is given in the trailing comment; with an empty key
# the entries were indistinguishable from each other and could not match their symbol.
- "↘": [t: "⠡⠂"]              # 0x2198 (down-right diagonal arrow)
- "↗": [t: "⠌⠂"]              # 0x2197 (up-right arrow)
- "↑": [t: "⠸⠁"]              # 0x2191 (upwards arrow)
- "△": [t: "⠠⠾"]              # 0x25B3 (triangle)
- "▭": [t: "⠯⠽"]              # 0x25AD (rectangle)
- "□": [t: "⠸⠽"]              # 0x25A1 (square)
- "⟺": [t: "⠪⠒⠕"]            # 0x27FA (biconditional, "if and only if")
Expand Down
Loading

0 comments on commit b52370f

Please sign in to comment.