Merge pull request #230 from NSoiffer/space-as-attr

Change whitespace handling to use attributes
NSoiffer · Dec 10, 2023 · b52370f · b52370f
2 parents 763d90e + d5c076d
commit b52370f
Show file tree

Hide file tree

Showing 23 changed files with 1,493 additions and 884 deletions.
diff --git a/Rules/Braille/CMU/CMU_Rules.yaml b/Rules/Braille/CMU/CMU_Rules.yaml
@@ -1,6 +1,38 @@
 # References such as 7.2(5) refer to the fifth example in section 7.2 in guide from ONCE
 # https://www.once.es/servicios-sociales/braille/comision-braille-espanola/documentos-tecnicos/documentos-tecnicos-relacionados-con-el-braille/documentos/b5-signografia-matematica.pdf
 ---
+-  # Rule: wrap a unit element in "𝐖" markers on whichever side has recorded whitespace
+  name: unit-spaces
+  tag: "!*"  # NOTE(review): presumably "!*" means "any tag, tried ahead of tag-specific rules" -- confirm against the rule-engine docs
+  match: "not(self::m:math) and not($MatchingWhitespace) and (@data-previous-space-width >= 0.25 or @data-following-space-width >= 0.25)"
+  replace:
+   - with:
+      variables: [MatchingWhitespace: "true()"]  # 'match' above tests this variable, so this rule cannot match again while it is true
+      replace:
+      - test:  # leading marker: only for a unit (by class, by intent, or via BaseNode) whose previous-space width is >= 0.25
+        - if: "(@class='MathML-unit' or contains(@intent, ':unit') or BaseNode(.)[@class='MathML-unit' or contains(@intent, ':unit')]) and @data-previous-space-width >= 0.25" 
+          then: [t: "𝐖"]
+      - x: "."  # the current node again; NOTE(review): a non-unit element also matches this rule but reaches only this step, so it gets no marker
+      - test:  # trailing marker: same unit check, using the following-space width
+        - if: "(@class='MathML-unit' or contains(@intent, ':unit') or BaseNode(.)[@class='MathML-unit' or contains(@intent, ':unit')]) and @data-following-space-width >= 0.25" 
+          then: [t: "𝐖"]
+
+-  # Rule: braille for any element whose intent contains ':blank' (omitted content)
+  name: omissions
+  tag: "!*"  # NOTE(review): presumably "!*" means "any tag, tried ahead of tag-specific rules" -- confirm against the rule-engine docs
+  match: "contains(@intent, ':blank')"
+  replace:
+  - test:
+      - if: "self::m:mo"  # the blank is an operator
+        then: [t: "⠰"]      # 14.5(1)
+      - else_if: "contains(., '\u00A0')"  # the content includes a non-breaking space
+        then:   # treated as a number omission -- FIX: anything to look at to increase the odds of it being digits
+        - t: "N"
+        - x: "translate(., '_\u00A0', '⠰')"  # '_' becomes '⠰'; the non-breaking space has no counterpart in the replacement string, so translate() removes it
+        else: [t: "⠰⠤⠆"]  # every other blank
+  - t: ""  # emits an empty string -- NOTE(review): purpose is not evident from this rule alone; confirm it is needed
+
+
 -
   name: default
   tag: msqrt
@@ -201,6 +233,7 @@
     - RowStart: "''"           # empty string -- it needs to be set
     - RowEnd: "''"             # empty string -- it needs to be set
     - NewScriptContext: "''"    # empty string -- it needs to be set
+    - MatchingWhitespace: "false()"
   replace: [x: "*"]
 
 -
@@ -260,40 +293,6 @@
   match: "."
   replace: [x: "*"]
 
--
-  # Non-breaking whitespace is added to the start/end of numbers in cases of adjoining text to help the parse.
-  # These generate a different char inside of a number, so we peel them off here because it doesn't seem possible
-  # to ask "what character position am I inside of an mn?" in unicode.yaml
-  # Here we deal with the messy abnormal case separately -- CMU seems to want to leave out spaces
-  name: peel-off-spaces
-  tag: [mn, mi]
-  match: "starts-with(., '\u00A0') or substring(., string-length(.), 1)='\u00A0'"
-  replace:
-  - test:
-      if: "starts-with(., '\u00A0')"
-      then:
-      - test:
-          if: "substring(., string-length(.), 1)='\u00A0'"
-          then:
-          - x: "BrailleChars(., 'CMU', 2, string-length(.))"
-          - test:
-              if: following-sibling::*[2][@class='MathML-unit' or BaseNode(.)[@class='MathML-unit']]  # '*[2]' to skip invisible times
-              then: [t: "𝐖"]   # BANA 5(a) -- Units are treated as separate exprs
-              else: [t: "W"]
-          else:
-          - x: "BrailleChars(., 'CMU', 2, string-length(.)+1)"
-      else:
-      - test:
-          if: "substring(., string-length(.), 1)='\u00A0'"
-          then:
-          - x: "BrailleChars(., 'CMU', 1, string-length(.))"
-          - test:
-              if: following-sibling::*[2][@class='MathML-unit' or BaseNode(.)[@class='MathML-unit']]  # '*[2]' to skip invisible times
-              then: [t: "𝐖"]   # BANA 5(a) -- Units are treated as separate exprs
-              else: [t: "W"]
-          else:
-          - x: "BrailleChars(., 'CMU', 1, string-length(.)+1)"
-
 -
   name: roman_numeral
   tag: mn
@@ -303,29 +302,6 @@
   - x: "BrailleChars(., 'CMU', 1, 2)"
   - x: "BrailleChars(translate(., 'IVXLCDM', 'ivxlcdm'), 'CMU', 2, string-length(.)+1)"
 
--
-  name: omissions
-  tag: [mi, mtext, mo]
-  match: "contains(@intent, ':omission')"
-  replace:
-  - test:
-      - if: "self::m:mo"
-        then: [t: "⠰"]      # 14.5(1)
-      - else_if: "contains(., '\u00A0')"
-        then:   # treated as a number omission -- FIX: anything to look at to increase the odds of it being digits
-        - t: "N"
-        - x: "translate(., '_\u00A0', '⠰')"  # note space is removed
-        else: [t: "⠰⠤⠆"]
-  - t: ""
-
--
-  name: units
-  tag: [mi, mtext]
-  match: "contains(@intent, ':unit')"
-  replace:
-  - t: "W"
-  - x: "text()"
-
 -
   name: Ln-hack
   tag: mi
@@ -361,7 +337,8 @@
   replace:
   - x: "BrailleChars(., 'CMU')"   # also deals with "dot after 'arc'"
   - test:  # Section 12 says strings longer than one should end with dot 3 rather than restricting to just listed functions (don't do this for mtext)
-      if: "string-length(.) > 1 and translate(., 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ', '') = ''"
+      if: "string-length(.) > 1 and translate(., 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ', '') = '' and
+           not(@class='MathML-unit' or contains(@intent, ':unit') )"
       then: [t: "."]
 
 
@@ -1133,38 +1110,40 @@
   name: default
   tag: menclose
   match: "."
-  variables:
-  - IsCancellation: "contains(@notation,'updiagonalstrike') or contains(@notation,'downdiagonalstrike') or 
-                     contains(@notation,'verticalstrike') or contains(@notation,'horizontalstrike')"
   replace:
   - test:
-      if: "contains(@notation,'top')"
-      then: [t: "⠈⠉"]                  # overline
-  - test:
-      if: "contains(@notation,'bottom')"
-      then: [t: "⠠⠤"]                  # underline
-  - test:
-      if: "$IsCancellation"
-      then: [t: "⠻"]                   # cancellation
-  - test:  # group anything that isn't a single character or bracketed
-      if: "*[1][(IsNode(., 'leaf') and (string-length(.) = 1 or IsInDefinition(., 'CMUFunctionNames'))) or IsBracketed(., '', '')]"
-      then: [x: "*[1]"]
+      if: "contains(@notation,'box')"
+      then_test: 
+          if: "*[1][self::m:mtext and .='\u00A0']"                  # box and roundedbox
+          then: [t: "⠰⠤⠆"]                            # omission
+          else: [t: "1⠫⠼⠙"]                           # square (no rectangle in UEB)
       else:
-      - t: "⠢"
-      - x: "*[1]"
-      - t: "⠔"
-
+      - with:
+          variables:
+          - IsCancellation: "contains(@notation,'updiagonalstrike') or contains(@notation,'downdiagonalstrike') or 
+                             contains(@notation,'verticalstrike') or contains(@notation,'horizontalstrike')"
+          replace:
+          - test:
+              if: "contains(@notation,'top')"
+              then: [t: "⠈⠉"]                  # overline
+          - test:
+              if: "contains(@notation,'bottom')"
+              then: [t: "⠠⠤"]                  # underline
+          - test:
+              if: "$IsCancellation"
+              then: [t: "⠻"]                   # cancellation
+          - test:
+              if: "*[1][(IsNode(., 'leaf') and (string-length(.) = 1 or IsInDefinition(., 'CMUFunctionNames'))) or IsBracketed(., '', '')]"
+              then: [x: "*[1]"]
+              else:
+              - t: "⠢"
+              - x: "*[1]"
+              - t: "⠔"
+
   #  - test:
   #     if: "contains(concat(' ', normalize-space(@notation), ' '), ' left ')"                  #avoid 'leftarrow'
   #     then: [t: "⠸"]
   #  - test:
-  #     if: "contains(@notation,'box')"                  # box and roundedbox
-  #     then:
-  #     # - test:
-  #     #       if: "$AddSpaces"
-  #     #       then: [t: " "]
-  #     - t: "1⠫⠼⠙"                                  # square (no rectangle in UEB)
-  #  - test:
   #     if: "contains(@notation,'circle')"
   #     then:
   #     # - test:

diff --git a/Rules/Braille/CMU/unicode.yaml b/Rules/Braille/CMU/unicode.yaml
@@ -488,15 +488,12 @@
 #  - "⇣": [t: "1⠳⠂⠩"]             # 0x21E3 (Downwards dashed arrow)
 #  - "⥂": [t: "⠸⠶"]               # 0x2942 (Rightwards arrow above short leftwards arrow (equilibrium, trend to the right))
 #  - "⥄": [t: "⠈⠸⠶"]              # 0x2944 (Short rightwards arrow above leftwards arrow (equilibrium, trend to the left))
-#  - "△": [t: "1⠫⠼⠉t"]            # 0x25B3 (Triangle)
-#  - "□": [t: "1⠫⠼⠙t"]            # 0x25A1 (Square)
 #  - "▤": [t: "⠨⠫⠼⠙t"]            # 0x25A4 (Square with horizontal fill)
 #  - "▥": [t: "⠨⠫⠼⠙t"]            # 0x25A5 (Square with vertical fill)
 #  - "▦": [t: "⠨⠫⠼⠙t"]            # 0x25A6 (Square with orthogonal crosshatch fill)
 #  - "▧": [t: "⠨⠫⠼⠙t"]            # 0x25A7 (Square with upper left to lower right fill)
 #  - "▨": [t: "⠨⠫⠼⠙t"]            # 0x25A8 (Square with upper right to lower left fill)
 #  - "▩": [t: "⠨⠫⠼⠙t"]            # 0x25A9 (Square with diagonal crosshatch fill)
-#  - "○": [t: "1⠫⠿t"]             # 0x25CB (Circle)
 #  - "◍": [t: "⠨⠫⠿t"]             # 0x25CD (Circle with vertical fill)
 #  - "▱": [t: "1⠫⠈⠼⠙t"]           # 0x25B1 (Parallelogram)
 #  - "▲": [t: "⠸⠫⠼⠉t"]            # 0x25B2 (Filled triangle)
@@ -538,15 +535,17 @@
 
 - " ":                        # 0x20 (Space)
     - test:
-        if: "self::m:mn"
-        then: [t: "N⠄"]
-        else: [t: "W"]
+        - if: "self::m:mn"  # the space sits inside an mn
+          then: [t: "N⠄"]
+        - else_if: "@data-added='missing-content' or @data-empty-in-2D or @width > 1.1"  # flagged as added/empty content, or width above 1.1
+          then: [t: "⠰⠤⠆"]       # omission
+        - else_if: "@width < 0.25"  # NOTE(review): same 0.25 threshold as the unit-spaces rule in CMU_Rules.yaml -- keep the two in sync
+          then: [t: ""]       # tweaking space -- ignore
+          else: [t: "W"]      # space in text or wide enough space
  - " ":                        # 0xa0 (Non-breaking Space)
      - test:
         - if: "self::m:mn"
           then: [t: "N⠄"]
-        - else_if: "following-sibling::*[1][@class='MathML-unit' or BaseNode(.)[@class='MathML-unit']]"
-          then: [t: "𝐖"]
           else: [t: "W"]
  - ",":                        # 0x2c (Comma)
      - test:
@@ -562,28 +561,17 @@
         # 2. if there is a '.' or other likely block separator after the first '.', then not decimal separator
         # likely other (complicated?) other tests that could be used
         - with:
-            variables: [StringAfter: "substring-after(DEBUG(.), '.')"]
+            variables: [StringAfter: "substring-after(., '.')"]
             replace:
             - test:
-                if: "string-length(DEBUG($StringAfter)) >= 3 and DEBUG(string-length($StringAfter))=string-length( DEBUG(translate($StringAfter, $BlockSeparators, '')) )"
+                if: "string-length($StringAfter) >= 3 and string-length($StringAfter)=string-length( translate($StringAfter, $BlockSeparators, '') )"
                 then: [t: "N⠄"]   # really is a block separator
                 else: [t: "N⠂"]   # treat as decimal separator
         else: [t: "W⠄"]
  - ":": [t: "⠐⠂"]              # 0x003A (Colon)
  - "-": [t: "⠤"]              # 0x2d (Minus sign or hyphen)
  - "⁡": [t: ""]                # 0x2061⁡ (invisible function apply)
-
- - "⁢":                         # 0x2062 (invisible times)
-    - test:
-        if: # GTM 9.3.3 (not very clear in rule, but the function name has to start with a lower case latin char ['no indicators'])
-        - "parent::m:mrow and "
-        - "preceding-sibling::*[1]["
-        - "           (self::m:mi and translate(., 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ', '') = '') or "
-        - "           (self::m:mrow and translate(*[last()], 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ', '') = '')] and"
-        - "  following::*[1][self::m:mrow and count(*)=3 and "   # look for function apply
-        - "     *[2][text()='⁡'] and *[1][self::m:mi and translate(., 'abcdefghijklmnopqrstuvwxyz', '') = '']]" 
-        then: [t: "W"]
-        else: [t: ""]
+ - "⁢": [t: ""]                # 0x2062 (invisible times)
  - "⁣": [t: ""]                 # 0x2063⁡ (invisible separator)
  - "⁤": [t: ""]                 # 0x2064 (invisible plus)
 
@@ -602,6 +590,7 @@
  - "↘": [t: "⠡⠂"]                # 0x2198 (flecha oblicua abajo-derecha)
  - "↗": [t: "⠌⠂"]                # 0x2197 (flecha arriba-derecha)
  - "↑": [t: "⠸⠁"]                # 0x2191 (flecha hacia arriba)
+ - "△": [t: "⠠⠾"]               # 0x25B3 (Triangle)
  - "▭": [t: "⠯⠽"]                # 0x25AD (rectángulo)
  - "□": [t: "⠸⠽"]                # 0x25A1 (cuadrado)
  - "⟺": [t: "⠪⠒⠕"]                # 0x27FA (doble implicación «si y solo si»)