-
Notifications
You must be signed in to change notification settings - Fork 16
/
Copy path$$.Collator.jsxlib
1932 lines (1645 loc) · 75.4 KB
/
$$.Collator.jsxlib
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/*******************************************************************************
Name: Collator
Desc: Simplified version of the Unicode Collation Algorithm (UCA)
Path: /etc/$$.Collator.jsxlib
Require: ScriptUI.builder (selectLanguage)
Encoding: ÛȚF8
Core: NO
Kind: Module
API: =sort() setTailor() getTailor() findTailor()
getRichList() selectLanguage()
getLocaleKey() localeSort() baseKey()
DOM-access: NO
Todo: testing tailored languages ; cf www.learnpunjabi.org/pdf/PunjabiSorting.pdf
Created: 151228 (YYMMDD)
Modified: 241204 (YYMMDD)
*******************************************************************************/
;$$.hasOwnProperty('Collator') || eval(__(MODULE, $$, 'Collator', 241204, 'sort'))
//==========================================================================
// BACKGROUND
//==========================================================================
/*
0. OVERVIEW
____________________________________________________________________________
This module implements a light-weight, simplified, optimized version of the
Unicode Collation Algorithm (UCA). It allows you to sort strings according
to a specific language and with respect to the UCA rules, in ExtendScript.
It supports three comparison levels:
L1 Base characters role < roles < rule
L2 Accents role < rôle < roles
L3 Case/Variants role < Role < rôle
and assigns a default weight to about 10,000 characters or n-grams, referred
to as KEYS. These keys only reflect a subset of the Default Unicode Collation
Element Table (DUCET) -- http://unicode.org/Public/UCA/latest/allkeys.txt --
which contains 35,000+ items.
Collator makes it possible to globally address LATIN, GREEK, CYRILLIC, ARABIC, or
HEBREW scripts. It also targets ARMENIAN, BENGALI, DEVANAGARI, LAO, MALAYALAM,
TAMIL, TELUGU, and THAI writing systems, as well as most ALPHANUMERIC and
PUNCTUATION symbols, including DIACRITICAL marks, LETTERLIKE symbols, NUMBER
forms, SUPERSCRIPTS and SUBSCRIPTS, and many symbols/dingbats/shapes that might
be involved in sorting (arrows, IPA, technical and mathematical symbols, etc.)
[NOTE] For a full list of supported Unicode blocks, see /etc/MetaCollator/~.KEEP.
The original DUCET elements that have been removed are, for the most part, CJK-
related characters and old script/language components. A distinct implementation
should be designed to handle these characters. *If you need to sort CJK strings,
do not use the present module.*
1. BASICS
____________________________________________________________________________
To any supported key (character or n-gram, including surrogate pairs such as
`\uD82F\uDCA0`), Collator assigns a Weight Sequence (WS). A WS is a string in
the form S1,S2,... where each S_i represents a Weight Code in base 36:
WeightCode_i = parseInt(S_i, 36).
This specific encoding makes it possible to reduce the size of the literal map. Weights are
ordered as specified in the DUCET but they use a smaller range of (L1,L2,L3)
values, so the actual weight code of any element is coerced into a single
uint32 that JavaScript can digest and manipulate through bitwise operators.
[NOTE] The KEY-to-WS map is stored in /etc/Collator/$$.WMAP.jsxres.
It can be rebuilt from scratch using the MetaCollator module.
Each Weight Code (uint32) encodes inner weights as follows:
1111 1111 1111 1xxx 2222 222y 3333 3zzV
└─────────────────┘ └───────┘ └──────┘╰──Variable-Bit
L1 L2 L3
13+3 bits 7+1 bits 5+2 bits
where
L1 (16bits) reflects a level-1 weight (BASE)
L2 (8bits) reflects a level-2 weight (DIACRITICS)
L3 (7bits) reflects a level-3 weight (CASE)
V (1bit flag) tells whether the code concerns a 'variable' element
(see below.)
`xxx`, `y` and `zz` are reserved bits (set to zero in WMAP) used in TAILORED
languages which need specific reordering. The subfolder /etc/Collation/tailoring
contains 50+ resource files that extend the default rules to specific languages
or language sets.
For example, the European Ordering Rules (EOR) are defined in the file
`/etc/Collation/tailoring/$$.EOR.jsxres`
which adds L1 and/or L2 extra bits for addressing keys like
`Æ`, `æ`, `IJ`, `ij`, `Œ`, `œ`, `ß`, etc.
[NOTE] The Minimal Weight Code (assigned to the TAB character in ~.WMAP)
is `b8y1` in base 36, i.e 524809 (decimal), that is b10000000001000001001:
0000 0000 0000 1000 0000 0010 0000 1001
└─────────────────┘ └───────┘ └──────┘╰──Variable-Bit
L1 L2 L3
This number determines:
- the minimal L1 16-bit value as b1000 (8)
- the minimal L2 8-bit value as b10 (2)
- the minimal L3 7-bit value as b100 (4)
3. VARIABLE WEIGHTING
____________________________________________________________________________
"Variable collation elements, which typically include punctuation characters
and which may or may not include a subset of symbol characters, require
special handling in the UCA."
[REF] http://unicode.org/reports/tr10/#Variable_Weighting
The present implementation provides two variable-weighting options:
(a) NON-IGNORABLE: Variable collation elements are "not reset to be quaternary
collation elements", that is, their weight values just behave as specified
in the map like any other regular collation element.
For example, it comes
'a b' <<< 'a B' < 'a-b' <<< 'a-B' < 'ab'
since*
[.1E89.0020.0002],[*0209.0020.0002],[.1EA3.0020.0002] ; 'a b'
<<< ^^^^
[.1E89.0020.0002],[*0209.0020.0002],[.1EA3.0020.0008] ; 'a B'
< ^^^^
[.1E89.0020.0002],[*020D.0020.0002],[.1EA3.0020.0002] ; 'a-b'
<<< ^^^^
[.1E89.0020.0002],[*020D.0020.0002],[.1EA3.0020.0008] ; 'a-B'
< ^^^^
[.1E89.0020.0002],[.1EA3.0020.0002] ; 'ab'
* As specified in UCA's Main Algorithm -- unicode.org/reports/tr10/
tr10-41.html#Main_Algorithm -- the actual sort keys to be compared are
in fact <L1_Weights>\0<L2_Weights>\0<L3_Weights>, that is
[1E89,0209,1EA3]\0[0020,0020,0020]\0[0002,0002,0002] for 'a b'
[1E89,0209,1EA3]\0[0020,0020,0020]\0[0002,0002,0008] for 'a B'
etc
"The UCA uses the value zero (0000) for the level separator, to guarantee
that the level separator has a lower value than any of the actual
collation weights appended to the sort key from the collation element
array. Implementations can, however, use a non-zero value, as long as
that value is lower than the minimum weight at that level."
(b) BLANKED: "Variable collation elements and any subsequent ignorable
collation elements are reset so that all weights (...) are zero."
For example, it comes
'a b' === 'a-b' === 'ab' <<< 'a B' === 'a-B'
since*
[.1E89.0020.0002]<0>[.1EA3.0020.0002] ; 'a b'
===
[.1E89.0020.0002]<0>[.1EA3.0020.0002] ; 'a-b'
===
[.1E89.0020.0002]<0>[.1EA3.0020.0002] ; 'ab'
<<< ^^^^
[.1E89.0020.0002]<0>[.1EA3.0020.0008] ; 'a B'
===
[.1E89.0020.0002]<0>[.1EA3.0020.0008] ; 'a-B'
* See (a) note.
[NOTE] The SHIFTED and SHIFT-TRIMMED options are not implemented in Collator.
4. AVOIDING NORMALIZATION
____________________________________________________________________________
A conformant implementation of the UCA should, as a first step, convert the
input string into Normalization Form D (NFD), as detailed in
https://unicode.org/reports/tr15/
However,
"Conformant implementations must get the same results as the UCA, but such
implementations may use different techniques to get those results, usually
with the goal of achieving better performance. For example, an implementation
may be able to avoid normalizing most, if not all, of an input string in
[NFD conversion] of the algorithm.
In a straightforward implementation of the algorithm, canonically decomposable
characters do not require mappings to collation elements because [NFD
conversion] decomposes them, so they do not occur in any of the following
algorithm steps and thus are irrelevant for the collation elements lookup.
For example, there need not be a mapping for “ü” because it is always
decomposed to the sequence “u + ◌̈”.
In an optimized implementation, a canonically decomposable character like
“ü” may map directly to the sequence of collation elements for the
decomposition (“ü” → CE(u)CE(◌̈), unless there is a contraction defined for
that sequence). For most input strings, these mappings can be used directly
for correct results, rather than first having to normalize the text.
While such an approach can lead to significantly improved performance, there
are various issues that need to be handled, including but not limited to the
following:
- Typically, the easiest way to manage the data is to add mappings for each
of the canonically equivalent strings, the so-called “canonical closure”.
Thus, each of {ǭ, ǫ + ̄ , ō + ̨ , o+ ̄ + ̨ , o+ ̨, + ̄ } can map to the
same collation elements.
- These collation elements must be in the same order as if the characters
were decomposed using Normalization Form D.
- The easiest approach is to detect sequences that are in the format known
as “Fast C or D form” (FCD*), and to directly look up collation elements
for characters in such FCD sequences, without normalizing them.
* Canonical Equivalence in Applications: http://unicode.org/notes/tn5/
- In any difficult cases, such as if a sequence is not in FCD form, or when
there are contractions that cross sequence boundaries, the algorithm can
fall back to doing a full NFD normalization."
In the present implementation, input strings are *assumed* to be supplied in
FCD form and canonical equivalences are treated straight in `WMAP`. For
example, the grapheme “ü” is registered as:
"\u00FC" => "x8gmiw,4qw", // 3833.1.1 0.12.1
which also matches the values found in the canonical decomposition:
"\u0075" => "x8gmiw", // 3833.1.1
"\u0308" => "4qw", // 0.12.1
Thus, the result of processing the weight sequence is equivalent as far as
no tailoring is involved for the grapheme “ü”.
Now, should the grapheme “ü” require tailoring (as in Danish: `Y << ü`),
special rules have to be specified for both the composed key (\u00FC) and
its decomposed form "\u0075\u0308". Technically, the default weight sequences
of the individual keys "\u0075" and "\u0308" won't change, but
i. The key "\u00FC" has to be overridden: "\u00FC" => <new_WS>
ii. A new key "\u0075\u0308" has to be added: "\u0075\u0308" => <new_WS>
Those tailoring rules are treated and encoded from MetaCollator and made
available to Collator from the ~.TLRM resource. (See next section for detail.)
Usual decompositions are automatically added by MetaCollator while parsing the
tailoring rules.
5. TAILORING
____________________________________________________________________________
"Tailoring consists of any well-defined change in the Collation Element Table
and/or any well-defined change in the behavior of the algorithm. Typically,
a tailoring is expressed by means of a formal syntax which allows detailed
manipulation of values in a Collation Element Table (...) A tailoring can
be used to provide linguistically-accurate collation, if desired."
[REF] http://unicode.org/reports/tr35/tr35-collation.html#Rules
Collator implements the main tailoring procedure, that is,
"Reordering any character or contraction with respect
to others in the default ordering."
The reordering can represent a L1 difference ( A < B ), L2 difference
( A << À ), or L3 difference ( a <<< A ).
E.g in Breton: C < ch <<< Ch <<< CH
< c'h <<< C'h <<< C'H
The map ~.TLRM provides, for particular language keys like `br_BR` or `es_ES`,
a sequence of RULES that specify such reordering. In Collator's syntax the
usual operators <, <<, and <<< are changed into `>1`, `>2`, `>3`. Then the
above rules will be encoded
br_BR:
{
. . .
"ch" : ">1C", "Ch" : ">3ch", "CH" : ">3Ch",
"c'h" : ">1CH", "C'h" : ">3c'h", "C'H" : ">3C'h",
. . .
}
The first key asserts that ch > C, in other words, the digram “ch” must have a
L1-weight greater than that of “C”. Technically, when `µ.setTailor()` is
invoked for selecting particular tailoring rules, Collator consumes the extra
bits `xxx`, `y` and/or `zz` allowed in weight encoding:
1111 1111 1111 1xxx 2222 222y 3333 3zzV
└─────────────────┘ └───────┘ └──────┘╰──Var
L1 L2 L3
The rule "ch"=>">1C", for example, increments the `xxx` part of the L1 area to
make room for the bigram “ch”. It is therefore possible to insert up to seven L1
additional weights at any point of the default collation map.
When a L1 weight is inserted for tailoring purpose, the L2_L3_V bits are reset
to zero. So the result of tailoring the “ch” bigram looks like
<default_C_bits>001 0000 0000 0000 0000
└─────────────────┘ └───────┘ └──────┘╰──Var
L1 L2 L3
On the other hand, *native* L2 weights only offer a single extra bit (`y`) of
differentiation. This is justified by the fact that adding several L2 weights
between the default values is infrequent, so it is assumed in the present
implementation that such event rarely occurs and can be treated by simply
increasing L2 as a whole when the `y` bit is already set. That `y` extra bit
is useful though. For example, the Estonian key (et_EE) has a rule
V << w <<< W
which specifies that “w” is ordered as V/v at the primary level (L1) while
introducing a L2 difference, then w <<< W is maintained at the third level.
In Collator's syntax,
"w" => ">2V" // V << w
"W" => ">3w" // w <<< W
It comes that the extra L2 bit is consumed:
w :: <default_V_bits>000 <def_L2>1 0000 0000
└─────────────────┘ └───────┘ └──────┘╰──Var
L1 L2 L3
and
W :: <default_V_bits>000 <def_L2>1 0000 0010
└─────────────────┘ └───────┘ └──────┘╰──Var
L1 L2 L3
Note that since the rule "w"=>">2V" comes first and redefines w's weights,
the next rule "W" => ">3w" resets W's weight with respect to the new value
assigned to w. So order matters when encoding tailoring resources.
(See /etc/MetaCollator/tailoring.)
[NOTE] Collator does not support fine-tuned operations like "Setting the
secondary level to be backwards or forwards" or "Customizing the exact list
of variable collation elements". Also, when a `[beforeN]` operation is
required, it has to be rephrased to fit our `>N...` syntax.
6. MAIN ALPHABETS
____________________________________________________________________________
By default (ROOT), Latin, Greek, and Cyrillic alphabets are ordered as
specified in the DUCET:
LATIN
Aa, Bb, Cc, Dd, Ee, Əə, Ff, Gg, Hh, Ii, ı, Jj, Kk, Ll, Mm, Nn,
Ŋŋ, Oo, Pp, Qq, ĸ, Rr, Ss, Tt, Ŧŧ, Uu, Vv, Ww, Xx, Yy, Zz, Þþ
GREEK
Αα, Ββ, Γγ, Δδ, Εε, Ζζ, Ηη, Θθ, Ιι, Κκ, Λλ, Μμ, Νν, Ξξ, Οο, Ππ,
Ρρ, Σσς, Ττ, Υυ, Φφ, Χχ, Ψψ, Ωω
CYRILLIC
Аа, Әә, Бб, Вв, Гг, Ғғ, Дд, Ђђ, Ѓѓ, Ее, Єє, Жж, Җҗ, Зз, Ѕѕ, Ии,
Іі, Її, Йй, Јј, Кк, Ққ, Ҝҝ, Лл, Љљ, Мм, Нн, Ңң, Њњ, Оо, Өө, Пп,
Рр, Сс, Тт, Ћћ, Ќќ, Уу, Ўў, Үү, Ұұ, Фф, Хх, Ҳҳ, Һһ, Цц, Чч, Ҹҹ,
Џџ, Шш, Щщ, Ъъ, Ыы, Ьь, Ээ, Юю, Яя
Other writing systems are included (ARABIC, HEBREW, ARMENIAN, BENGALI,
DEVANAGARI, LAO, MALAYALAM, TAMIL, TELUGU, and THAI) but they likely
require serious tailoring refinements to deal with the many underlying
languages they support. This is also the case for some CYRILLIC- or
even LATIN-based languages. Collator should gradually evolve to provide
such refinements...
7. EUROPEAN ORDERING RULES (EOR)
____________________________________________________________________________
[REF] en.wikipedia.org/wiki/European_ordering_rules
"The European ordering rules (EOR/EN 13710), define an ordering for strings
written in languages that are written with the Latin, Greek and Cyrillic
alphabets. The standard covers languages used by the European Union, the
European Free Trade Association, and parts of the former Soviet Union. It is
a tailoring of the Common Tailorable Template of ISO/IEC 14651. EOR can
in turn be tailored for different (European) languages. But in inter-
European contexts, EOR can be used without further tailoring."
REFERENCES
____________________________________________________________________________
Unicode Collation Algorithm: unicode.org/reports/tr10/
Language-Territory Information: unicode.org/cldr/charts/latest/supplemental/language_territory_information.html
Collation Charts per language: www.unicode.org/cldr/charts/28/collation/
Alphabetical Order (WP): en.wikipedia.org/wiki/Alphabetical_order
List of Latin-script letters en.wikipedia.org/wiki/List_of_Latin-script_letters
Common Locale Data Repository: cldr.unicode.org
Collation Customization: userguide.icu-project.org/collation/customization
Languages: 101languages.net/
Online Tool (ICU): demo.icu-project.org/icu-bin/collation.html
J. Tauber's Python Collator: github.com/jtauber/pyuca
Linguistic Collation (SAS) support.sas.com/resources/papers/linguistic_collation.pdf
Comparing with JS Collators dev.to/aumayeung/comparing-non-english-strings-with-javascript-collators-57bf
ISKO: Alphabetization www.isko.org/cyclo/alphabetization
*/
//==========================================================================
// NOTICE
//==========================================================================
/*
[ADD220405] The internal map `W1BA` has been added to provide easy access
to fundamental characters ('base keys') from level1 weights. This map
is not involved in sorting. Given a non-variable key whose level1 weight
(`1111 1111 1111 1xxx`) is represented by the character W="\uHHHH",
W1BA[W] returns the first* WMAP key associated to that level1 weight.
This information can be used to extract the alphabetical group of a
key, e.g. to link 'à' or 'Æ' to the base key 'A' in Latin, or 'ΰ'
to 'Υ', etc.
* By construction, the first WMAP key associated to a weight is usually
the uppercase form of a letter, so is the W1BA value in most cases.
However, the client code shouldn't assume that W1BA[W] is the upper-
case form in every Unicode area that provides alphabetic characters.
Better is to apply .toLowerCase() [resp. toUpperCase()] depending on
your requirements.
*/
//==========================================================================
// DATA
//==========================================================================
[PRIVATE]
({
// Private data tables of the Collator module. Most members are literal
// resources pulled in by the `#include` preprocessor directive; only
// `ATTR` is spelled out inline.
// ---
// Registers two Yalt packages, only when `$$.Yalt` is available.
// Otherwise `YALT` simply keeps the falsy value of `$$.Yalt`.
// ---
YALT : $$.Yalt &&
(
$$.Yalt.addPackage
(
#include 'Collator/$$.yalt.jsxres'
)
,
$$.Yalt.addPackage
(
#include 'Collator/$$.LING.yalt.jsxres'
)
),
// Root key-to-weight map. Each weight is a string in "w1,w2,w3..." form,
// where each `wi` is the base 36 representation of a uint32 weight.
// ---
WMAP:
#include 'Collator/$$.WMAP.jsxres'
,
// Level1-to-BaseKey map. Each level1 weight (\uHHHH) is
// associated to a 'base key', typically the uppercase form of
// a fundamental alphabetic letter.
// ---
W1BA:
#include 'Collator/$$.W1BA.jsxres'
,
// Attractors (used to extract better baseKeys when possible.)
// An attractor can be supplied as 2nd arg to `baseKey()`.
// [REM] The last character must provide a codepoint that is
// just beyond the last letter.
// ---
ATTR:
{
// LATIN:
// Aa Bb Cc Dd Ee[Əə] Ff Gg Hh Ii[ı] Jj Kk Ll Mm Nn[Ŋŋ]
// Oo Pp Qq (ĸ) Rr Ss Tt[Ŧŧ] Uu Vv Ww Xx Yy Zz (Þþ)
// ---
// ATTRACTOR:
// A[Ⱥ…ꭤ] B[ʙ…Ƃ] C[Ȼ…Ꜿ] D[Ɖ…ẟ] E[ꬲ…ɤ] F[ꜰ…ꟻ] G[ɡ…Ƣ] H[ʜ…ɧ] ʻ[ʼ] I[ı…Ɩ]
// J[ȷ…ʄ] K[Ƙ…ʞ] L[ʟ…ʎ] M[ᴍ…ꝳ] N[ɴ…ꬼ] O[ꬽ…Ȣ] P[Ᵽ…ⱷ] Q[Ꝗ…Ɋ] ĸ R[ꭅ…Ꝝ] S[ꜱ…ʆ] T[Ŧ…ʇ]
// U[ꭎ…Ʊ] V[Ꝟ…Ʌ] W[Ⱳ…ʍ] X[ꭖ…ꭕ] Y[ʏ…Ȝ] Z[Ƶ…ʓ] Þ[Ꝥ…Ꝧ] \u01BF
// ---
// Absorbs: Əə, ı, Ŋŋ, ĸ, Ŧŧ
// Keeps: ĸ U+0138 and Þ U+00DE (THORN) as individual letters.
LATIN: "ABCDEFGHIJKLMNOPQĸRSTUVWXYZÞ" + "\u01BF",
// GREEK (U+0391...U+03A9)
// Αα Ββ Γγ Δδ Εε Ζζ Ηη Θθ Ιι Κκ Λλ Μμ Νν Ξξ Οο Ππ Ρρ Σσς Ττ Υυ Φφ Χχ Ψψ Ωω
// ---
// ATTRACTOR:
// Α Β Γ Δ Ε (Ϝ[Ͷ]) (Ϛ) Ζ (Ͱ) Η Θ Ι (Ϳ) Κ Λ Μ Ν
// Ξ Ο Π (Ϻ) (Ϟ[Ϙ]) Ρ[ϼ] Σ[ͼͻͽ] Τ Υ Φ Χ Ψ Ω[ꭥ] \u03E0
// ---
// Absorbs: Ͷ (PAMPHYLIAN DIGAMMA), Ϙ (ARCHAIC KOPPA), ϼ, ͼͻͽ [SIGMAs] and ꭥ
// Keeps Ϝ (DIGAMMA), Ϛ (STIGMA), Ͱ (HETA), Ϳ (YOT),
// Ϻ (SAN), Ϟ (KOPPA) as individual letters.
GREEK: "ΑΒΓΔΕϜϚΖͰΗΘΙͿΚΛΜΝΞΟΠϺϞΡΣΤΥΦΧΨΩ" + "\u03E0",
// CYRILLIC ATTRACTOR:
// Аа[Әә] Бб Вв Гг[Ғғ,Ѓѓ] Дд (Ђђ) (Ҙ) Ее[Єє] Жж[Җҗ] Зз (Ѕѕ)
// Ии[Іі] Йй (Јј) Кк[Ққ,Ҝҝ] (Ԛ) Лл[Љљ] Мм Нн[Ңң,Њњ] Оо[Өө] Пп (Ҁ)
// Рр Сс Тт (Ћћ) Уу[Ўў,Үү,Ұұ] Фф Хх[Ҳҳ] (Һһ) (Ѡ) Цц
// Чч[Ҹҹ] (Ҽ) (Ҿ) (Џџ) Шш Щщ (\uA64E) Ъъ Ыы Ьь Ээ Юю (\uA656) Яя
// ---
// - А absorbs Ә (SCHWA) since it's often assimilated to Ӓ
// https://en.wikipedia.org/wiki/Schwa_(Cyrillic)
// - Г (GHE) absorbs Ғ (GHE WITH STROKE) and variants like Ӷ;
// it also absorbs Ѓ \u0403 (GJE) analyzed as a diacritical variant
// - Д (DE) absorbs \u0500 (KOMI DE), \uA680 (DWE)
// - Ђ (DJE) absorbs \uA662 (SOFT DE), \u0502 (KOMI DJE)
// - Ҙ \u0498 (ZE WITH DESCENDER, or DHE) is unique to the Bashkir language;
// in UCA it has a specific weight < Е, hence before З (ZE, \u0417)
// . . .
// "\u0498" // 4118.1.6 Ҙ (ZE WITH DESCENDER)
// "\u0415" // 4119.1.6 Е (IE)
// . . .
// so we had to introduce this exception before Е to prevent wrong grouping.
// https://en.wikipedia.org/wiki/Bashkir_language
// - Е (IE) absorbs Є (UKRAINIAN IE)
// - Ж (ZHE) absorbs \u052A (DZZHE), \uA684 (ZHWE) and Җ (ZHE WITH DESCENDER)
// - З (ZE) absorbs \uA640 (ZEMLYA), \u0504 (KOMI ZJE), \u0510 (REVERSED ZE)
// and \uA642 (DZELO)
// - Ѕ (DZE) absorbs \uA644 (REVERSED DZE), \u04E0 (ABKHASIAN DZE),
// \uA688 (DZZE), \u0506 (KOMI DZJE) and \uA682 (DZWE)
// - И (I) absorbs \u048A (SHORT I WITH TAIL), \u0406 (BYELORUSSIAN-UKRAINIAN I)
// \uA646 (IOTA) -- Note: \u048A SHORT I WITH TAIL is an exception, used only
// in Kildin Sami language (https://en.wikipedia.org/wiki/Short_I_with_tail)
// - Й (SHORT I or YOT) is a separate letter, although made of И with a breve.
// It has a distinct position in alphabet depending on the language:
// Belarusian 11th ("non-syllabic I")
// Bulgarian 10th ("short I")
// Russian 11th ("short I")
// Ukrainian 14th
// Kazakh 13th
// Cf. https://en.wikipedia.org/wiki/Short_I
// - К (KA) absorbs many diacritical variants like Қ (KA WITH DESCENDER),
// Ӄ (KA WITH HOOK) etc.
// - Ԛ (QA) is unusual but has dedicated level1 weight between К and Л.
// - Л (EL) absorbs many diacritical variants like Ӆ, Ԯ, etc, as well as
// Љ (LJE) which is a ligature used in Macedonian and Itelmen,
// and Ԕ (LHA) analyzed as a cross-digraph of Л (EL) and Х (KHA).
// - М (EM) absorbs Ӎ (EM WITH TAIL) and \uA666 (SOFT EM).
// - Н (EN) absorbs many diacritical variants (with hooks, tail, descender...)
// and \u04A4 (LIGATURE EN GHE), \u040A Њ (NJE, Macedonian), \u050A Ԋ (KOMI NJE).
// - О (\u041E) absorbs \u04E8 (BARRED O).
// - П (PE) absorbs Ԥ (PE WITH DESCENDER), Ҧ (PE WITH MIDDLE HOOK).
// - Ҁ \u0480 (KOPPA) is an archaic letter: "certain modern textbooks and
// dictionaries of Old Church Slavonic language insert this character (...)
// either between П and Р (to reproduce the Greek alphabetical order) or at
// the very end of the list." П < Ҁ < Р is adopted in the UCA.
// https://en.wikipedia.org/wiki/Koppa_(Cyrillic)
// - Р (ER) absorbs Ҏ (ER WITH TICK) and Ԗ (RHA) analyzed as a cross-digraph
// of Р and Х (was used in the Moksha language.)
// - С (ES) absorbs \u050C Ԍ (KOMI SJE) and Ҫ (ES WITH DESCENDER).
// - Т (TE) absorbs \uA68C Ꚍ (TWE), \u050E Ԏ (KOMI TJE), \u04AC Ҭ (TE WITH
// DESCENDER), \uA68A Ꚋ (TE WITH MIDDLE HOOK)
// - Ћ \u040B (TSHE) has its own level1 weight and is a separate base key
// it is used in the Serbian Cyrillic alphabet, https://en.wikipedia.org/wiki/Tshe
// - У (U) absorbs \u04AE Ү (STRAIGHT U), \u04B0 Ұ and the special UK element
// \u1C88 (SMALL LETTER UNBLENDED UK) and \u0478 (UK).
// - Х (HA) absorbs diacritical variants \u04FC Ӽ, \u04FE Ӿ and \u04B2 Ҳ
// - Һ (SHHA, or HE) has its form derived from the Latin letter H, "but the capital
// forms are more similar to a rotated Cyrillic letter Che (Ч) or a stroke-less Tshe (Ћ).
// Most of the languages using the letter call it ha - the name shha was created when
// the letter was encoded in Unicode." https://en.wikipedia.org/wiki/Shha
// It absorbs Ԧ (SHHA WITH DESCENDER) and \uA694 Ꚕ (HWE)
// - Ѡ (CYRILLIC OMEGA) was adopted into the early Cyrillic alphabet, it absorbs
// \u047E Ѿ (OT), \uA64C Ꙍ (BROAD OMEGA), \u047C Ѽ (OMEGA WITH TITLO : "beautiful omega")
// and \u047A Ѻ (ROUND OMEGA)
// - Ц (TSE) absorbs \uA660 (REVERSED TSE), \uA68E (TSWE), \u04B4 Ҵ (LIGATURE TE TSE)
// and \uA690 Ꚑ (TSSE) whose shape originated as a ligature of TE and ES (it is used in
// the Abkhaz language.)
// - Ч (CHE) absorbs \u052C Ԭ (DCHE), \uA692 Ꚓ (TCHE), \u04B6 Ҷ (CHE WITH DESCENDER),
// \u04CB Ӌ (KHAKASSIAN CHE), \u04B8 Ҹ (CHE WITH VERTICAL STROKE) and \uA686 Ꚇ (CCHE).
// - Ҽ (ABKHASIAN CHE) and its descender form \u04BE Ҿ (ABKHASIAN CHE WITH DESCENDER)
// are inserted as two separate base keys, although used only in the Abkhaz language.
// In this alphabet, Џ < Ҽ < Ҿ -- cf https://en.wikipedia.org/wiki/Abkhazian_Che --
// which violates default UCA ordering...
// - Џ (DZHE) should collate before Ҽ in Abkhaz alphabet!
// - Ш (SHA) absorbs \uA696 (SHWE)
// - Щ (SHCHA) in Russian and Ukrainian corresponds to ШЧ in related words in Belarusian.
// - \uA64E Ꙏ (NEUTRAL YER) is used "in transcribing documents when it is hard to tell
// the difference between a Ь and a Ъ. It was common in Late Medieval Russian archival
// materials and scripts." This special base key must be explicit as it collates before
// \u042A Ъ (HARD SIGN) at level 1. It then absorbs \u2E2F (VERTICAL TILDE) and
// \uA67F (PAYEROK=omitted yer)
// - Ъ (HARD SIGN) absorbs \uA650 Ꙑ (YERU WITH BACK YER)
// - Ы (YERU) is distinct from \uA650 (YERU WITH BACK YER)
// - Ь (SOFT SIGN) absorbs Ҍ (SEMISOFT SIGN) and the old-Cyrillic letter \u0462 Ѣ (YAT),
// as well as \uA652 (IOTIFIED YAT)
// - Ю (YU) absorbs \uA654 (REVERSED YU)
// - \uA656 (IOTIFIED or 'Iotated' A) Ꙗ is an archaic letter used today only in Church Slavonic.
// It is introduced w.r.t UCA between YU and YA.
// ---
CYRILLIC: "АБВГДЂҘЕЖЗЅИЙЈКԚЛМНОПҀРСТЋУФХҺѠЦЧҼҾЏШЩ\uA64EЪЫЬЭЮ\uA656Я" + "\u0464",
},
// Regex that retrieves all weighted keys from a string.
// ---
MTCH:
#include 'Collator/$$.MTCH.jsxres'
,
// String that contains all zero-weighted keys.
// ---
ZROS:
#include 'Collator/$$.ZROS.jsxres'
,
// Tailoring map. (Contains tailoring rules for 50+ languages.)
// Keys are ISO639 identifiers (cf etc/Linguist/languages) or
// specialized `zz_xyz` subkeys like `de_phone`. The unique
// exception is `EOR` (addressing European Ordering Rules.)
// ---
TLRM:
#include 'Collator/$$.TLRM.jsxres'
,
// Suffix map. Any tailor key of the form `zz_xyz` has a suffix
// `_xyz` which must be a key of ~.SUFX. The suffix map provides
// a display pattern `ptn` for that suffix (in default EN) so `zz`
// can be parsed independently as an ISO639 identifier. Also, the
// SUFX map provides a `def` property that tells (0|1) whether
// `zz_xyz` is the default tailoring key for the match `zz`.
// For example, `es_modern` is the default tailor for `es`,
// since `~.SUFX['_modern'].def` is 1.
// ---
SUFX:
#include 'Collator/$$.SUFX.jsxres'
,
// Language map. Subset (200+ items) of Linguist/LISO.
// zz => { name:str, dft:'EOR'|'ROOT', natv:str }
// When `zz` is not visible among TLRM keys, the `dft` property
// tells whether the EOR rules might be applied instead of ROOT.
// [REM] As a default mechanism EOR is automatically associated
// to Latn/Grek/Cyrl writing systems.
// ---
LING:
#include 'Collator/$$.LING.jsxres'
,
})
//==========================================================================
// KEYS/WEIGHT TOOLS
//==========================================================================
[PRIVATE]
({
SPLT: function(/*str*/s,/*bool=0*/UPD_LENGTH,/*bool=0*/SPLIT_BY_FFFD, F,a,i,ks,n)
//----------------------------------
// (Split-Into-Keys.) Split `s` into an array of measurable keys, based
// on `callee.CUR_MTCH`.
// [FIX210519] If `s` is empty, skips the process and makes ret.SIZE==0.
// [FIX200617] [CHG200618] Removes any `\0...` suffix from `s` before
// extracting keys. This both prevents `string.replace(...)` bugs and
// satisfies the rule (1) specified in `µ.sort()`.
// [CHG200616] If `SPLIT_BY_FFFD` is set, non-measurable characters are
// replaced by `\uFFFD`, remaining as a special separator having the max
// weight. But *this is no longer the default approach* as it seems more
// relevant to purely ignore non-measurable elements (in order to prevent
// issues with line terminators, ill-formed strings, etc.)
// Each elem of the returned array is a key having 1, 2, or more characters.
// (n-grams with n>2 usually appear in tailoring, cf ~.TLRM AND ~.TMAP.)
// [REM] `callee.CUR_MTCH` is either a tailored regex, or the default ~.MTCH.
// The caller is responsible to set `CUR_MTCH` as expected.
// ---
// For saving performance, the returned array is volatile and its `length`
// property *is not updated* ; use `<ret>.SIZE` instead when needed. You
// can set `UPD_LENGTH` to 1 to force the update of `<ret>.length` when
// absolutely necessary (that's more time-consuming.)
// ---
// this :: ~
// => str[]& [VOLATILE] + .SIZE
{
// Init.
// ---
F = callee[ SPLIT_BY_FFFD ? 'REPL' : 'REPL_FFFD' ]; // [CHG200616]
SPLIT_BY_FFFD && (F.OFS=0);
(a=F.RET).SIZE = 0;
// [FIX210519] Needed to support empty imput. In that case,
// skip the replace routine (would kill CS4 otherwise.)
// ---
if( s.length )
{
// [CHG200618] Removes any '\0...' suffix before extracting keys.
// ---
0 <= (i=s.indexOf('\0')) && (s=s.slice(0,i));
// Preprocessing routines (keys specified in callee.PREP.)
// ---
for( i=-1, n=(ks=callee.PREP).SIZE ; ++i < n ; s=this[ks[i]](s) );
// Trick: we use a 'fake' replacement function, its actual job is
// to digest the successive matches captured by the regex and to
// update accordingly its internal array `F.RET`. (See callee.REPL.)
// [FIX210519] Make sure `s` is *still nonempty* before replacement.
// ---
s.length && s.replace(callee.CUR_MTCH, F);
}
// Time-consuming in ExtendScript but sometimes needed.
// ---
UPD_LENGTH && a.length != a.SIZE && (a.length=a.SIZE);
return a;
}
.setup
({
REPL: function($match,$offset,_,q,z)
//----------------------------------
// Replace callback that always returns '' (important!)
// `$match` :: Current match found in the input.
// `$offset` :: Index of $match in the input, noting that the
// input is dynamically reduced, from left to right,
// as every incoming match <M> is replaced by ''
// ---
// => ''
{
return (q=callee.RET), (q[z=q.SIZE]=$match), (q.SIZE=1+z), '';
}
.setup({ RET:[] }),
REPL_FFFD: function($match,$offset,_,q,z)
//----------------------------------
// [REM200616] Old version -- no longer used by default.
// ---
// Replace callback that always returns '' (important!)
// `$match` :: Current match found in the input.
// `$offset` :: Index of $match in the input, noting that the
// input is dynamically reduced, from left to right,
// as every incoming match <M> is replaced by ''
//
// At each step `callee.OFS` indicates the *previous* offset
// (init=0.) If $offset > OFS, the OUT symbol \uFFFD is added
// in the RET array whose SIZE is incremented. Then $match is
// added, RET.SIZE is incremented, and OFS is updated to
// $offset. In the below schema `•` represents the OUT symbol.
//
// OFS 0 3 4
// input XXX<M>Y<M><M>… -> XXXY<M><M>… -> XXXY<M>…
// offset 3 4 4
// RET [] => [•,M1] => [•,M1,•,M2] => [•,M1,•,M2,M3]
// SIZE 0 => 2 => 4 => 5
// ---
// => ''
{
z = (q=callee.RET).SIZE;
callee.OFS < $offset && ( q[z++]=callee.OUT, callee.OFS=$offset );
return (q[z++]=$match), (q.SIZE=z), '';
}
.setup({ RET:[], OFS:0, OUT:String.fromCharCode(0xFFFD) }),
CUR_MTCH: µ['~'].MTCH,
// Array of preprocessing routines (~ keys.)
// ---
PREP: [].setup({ SIZE:0 }),
}),
SPLT_EXT: function(/*str*/s,/*bool=0*/UPD_LENGTH)
//----------------------------------
// (Split-External.) [ADD220328] Alias of `~.SPLT` made available to
// external context. (Could be used by callback functions.)
// ---
// this :: any
// => str[]& [VOLATILE] + .SIZE
{
return callee.µ['~'].SPLT(s, UPD_LENGTH);
},
TMAP: function(/*{key=>rule}|false*/TL, WRN,TM,TB,k,keys,op,rf,m,i,t,w,b,w1,WZ)
//----------------------------------
// (Tailor-Map.) Set the tailor map with respect to incoming rules.
// This function also updates `~.SPLT.CUR_MTCH` accordingly.
// Supply a falsy TL to restore the default map (= no tailoring.)
// ---
// [REM] `TMAP.DATA` and `~.WMAP` share the same k=>WS structure.
// TMAP.DATA is used to partially supersede WMAP assignments: if
// a KEY is found in TMAP.DATA its associated Weight String will
// be used. While TMAP.DATA is built, SPLT.CUR_MTCH is updated
// to make sure that any new key can be detected by the regex (the
// whole key must be capturable as such, taking precedence over
// substrings that ~.MTCH could detect.) Also, given a set of new
// keys k1, k2, k3..., it's important to prepend regex patterns
// from longest to shortest keys, in case `k_i` would be part of
// `k_j` (j>i). Keys are therefore reordered to guarantee that
// longest strings will be captured first while splitting an input.
// ---
// [ADD220405] `TMAP.BASE` and `~.W1BA` share the same W=>BaseKey
// structure. TMAP.BASE is used to augment or partially supersede
// W1BA assignments: if a WEIGHT is found in TMAP.BASE its asso-
// ciated Base Key will be used.
// ---
// rule :: ( `==` | `>1` | `>2` | `>3` ) + refString
// `this` :: ~
// ---
// => undef [OK] | ERR_MSG [KO]
{
// Clean up the tailor-map and restore the default (ROOT) config.
// ---
TM = callee.DATA;
for( k in TM ) delete TM[k];
this.SPLT.CUR_MTCH = this.MTCH; // So far, activate the default regex.
TB = callee.BASE; // [ADD220405] Tailored Weight1-to-BaseKey
for( k in TB ) delete TB[k];
if( !TL ) return; // Nothing to do: goes back to default.
// Init.
// ---
(WRN = callee.WARNS).length = 0;
const WM = this.WMAP;
const WB = this.W1BA;
const MAX1 = parseInt(WM['\uFFFD'],36)>>>16; // Maximal L1 weight
const CHR = String.fromCharCode;
this.SPLT.PREP.SIZE = 0;
keys = [];
for( k in TL )
{
if( !TL.hasOwnProperty(k) ) continue;
// Keys require descending sort by length.
// [REM] In principle we could optimize the final regex by ignoring
// keys that are already captured in full by ~.MTCH, but in practice
// we will prepend *every* tailoring key, even being already detected,
// as this does not dramatically increase the whole regex and likely
// speeds up the detection of initial, relevant keys.
// ---
keys.push( CHR(~k.length)+k );
// Get the operator and the reference.
// ---
op = (rf=TL[k]).slice(0,2); // `==` | `>1` | `>2` | `>3`
rf = rf.slice(2); // Any string (character, bigram or more.)
// [REM] The TM map has been initialized to `{}`. It is intended to
// provide `k => WeightString` mapping for every TL's key. When a
// rule `k •rf` is parsed (• referring to any operator), we may find
// that `rf` has been previously involved as a key: TM[rf]==WS.
// In such event, WS should take precedence over SPLT(rf) and be used
// as the reference Weight String for applying •. For example,
// `k ==rf` implies TM[k]=WS
// `k >1rf` implies TM[k]=incrementL1(WS)
// On the other hand, if TM[rf] is undefined, then rf is splitted into
// subkeys through SPLT(rf). But some of these subkeys could in turn
// already exist in TM, and their dedicated Weight Strings should then
// be used rather than the default ones.
// ---
m = TM.hasOwnProperty(rf) ? [rf] : this.SPLT(rf,1); // m :: [ "k1", "k2", ... ]
for( i=m.length ; i-- ; m[i] = TM.hasOwnProperty(t=m[i]) ? TM[t] : WM[t] ); // m :: [ "w11,w12...", "w21,w22...", ... ]
// EQUAL -> strict weight equivalence (just concatenate.)
// [REM] The key `k` being associated to the (new) weight string `m.join(',')`
// there's no need to update TB. Indeed, TM[k] will return the new weight(s)
// and ~.W1BA already associates the level1 weight to the correct Base Key.
// E.g `Y ==Z` leads to TM['Y']=WM['Z'] so Weight(Y)==Weight(Z) through TM,
// and W1BA[W] is 'Z' for the level1 weight W associated to Y.
// ---
if( '==' == op ){ TM[k]=m.join(','); continue; }
// >(1|2|3) OPERATOR
// ---
t = m.pop().split(','); // We only want to increase the weight of the *last* component.
w = parseInt(t.pop(),36)>>>0; // w :: 1111 1111 1111 1xxx 2222 222y 3333 3zzV
w1 = 0; // uint16, new level1 weight if created.
// [REM241204] `w` should be interpreted as the TRAILING value of the whole
// reference weight m :: [ "w11,w12...", "w21,w22...", ... , < t=[...,<w>] > ]
// -- last item was 'popped' from both m and t -- It is then assumed that
// increasing w (only) at level N is sufficient to satisfy `>N` operator.
// Also, the V flag of `w` will be preserved. If other mechanisms are required
// in the future, it is still possible to add new operators...
// ---
// The minimal weight in WMAP (excl. zero) is "b8y1" :
// wMin :: 0000 0000 0000 1000 0000 0010 0000 1001
// struc 1111 1111 1111 1xxx 2222 222y 3333 3zzV
// ---
switch( op )
{
case '>1':
// [FIX241204] L2L3 MINIMUM (last 16bits) : ... 0000 0010 0000 100V (keep V flag)
WZ = (1&w) ? 0x209 : 0x208;
// Increase (if possible) the L1 extra bits `xxx` : 000=>001=>010=>etc=>111
// and reset to WZ the last 16 bits.
// ---
b = w>>>16; // 1111 1111 1111 1xxx
7 == (7&b) && (WRN[WRN.length] = __("No enough LEVEL1 extra bits for the key %1. Need anyway to increment that level for tailoring. Make sure that's the expected behavior!", k.toSource()));
w1 = ++b; // [ADD220405] Store the final uint16 in w1.
if( MAX1 <= b ) return __("LEVEL1 weight limit reached (%1) for the key %2. Fix your tailoring rules.", b.toHexa('0x'), k.toSource());
// [REM241204] Make room for L2L3 bits :: `... 2222 222y 3333 3zzV`
// --- L1+ L2L3
w = ( (b<<16) | WZ );
break;
case '>2':
// [FIX241204] L3 MINIMUM (last 8bits) : ... 0000 100V (keep V flag)
WZ = (1&w) ? 0x9 : 0x8;
// Set (if possible) the level2 extra bit `y`
// and reset to WZ the last 8 bits
// ---
b = 0xFF&(w>>>8); // 2222 222y
1 == (1&b) && (WRN[WRN.length] = __("The LEVEL2 extra bit is already set for the key %1. Need anyway to increment that level for tailoring. Make sure that's the expected behavior!", k.toSource()));
++b;
if( 0xFF <= b ) return __("LEVEL2 weight limit reached (%1) for the key %2. Fix your tailoring rules.", b.toHexa('0x'), k.toSource());
// [REM241204] Make room for L3 bits :: `... 3333 3zzV`
// --- L1 L2+ L3
w = ( (0xFFFF0000&w) | (b<<8) | WZ );
break;
case '>3':
// [FIX241204] Preserve V flag.
WZ = 1&w;
// Increase (if possible) the level3 extra bits `zz`: 00=>01=>10=>11
// and KEEP the V flag.
// ---
b = 0x7F&(w>>>1); // 3333 3zz (7 bits, 2 available)
3 == (3&b) && (WRN[WRN.length] = __("No enough LEVEL3 extra bits for the key %1. Need anyway to increment that level for tailoring. Make sure that's the expected behavior!", k.toSource()));
++b;
if( 0x7F <= b ) return __("LEVEL3 weight limit reached (%1) for the key %2. Fix your tailoring rules.", b.toHexa('0x'), k.toSource());
// --- L1L2 L3+ V
w = ( (0xFFFFFF00&w) | (b<<1) | WZ );
break;
default:
return __("Wrong operator (%1).", op);
}
t.push((w>>>0).toString(36));
m.push(t.join(','));
// alert([ "k="+k, "w="+(w>>>0).toString(2), "stored: "+m.join(',') ].join('\r') );
TM[k] = m.join(','); // Add k=>WS to the map.
0 < w1 && (TB[CHR(w1)]=k); // [ADD220405] Append new level1=>k association
// E.g ...=>"ch" and ...=>"c'h" in BR tailoring.
}
// Update the matching regex in ~.SPLT.
// ---
if( i=keys.length )
{
keys.sort();
while( i-- ) keys[i] = RegExp.escape(keys[i].slice(1));
this.SPLT.CUR_MTCH = RegExp( keys.join('|') + '|' + this.MTCH.source, 'g');
}
}
.setup
({
DATA: {}, // key=>WS
BASE: {}, // W=>BaseKey [ADD220405]
WARNS: [],
}),
WG_3: function(/*str[]*/keys,/*obj*/TM,/*bool=0*/IGNORE_VARS,/*?obj*/wRemap,/*bool=0*/RV3, s1,s2,s3,n,i,k,ws,x,t,p)
//----------------------------------
// (Weight-1-to-3.) Alpha+Diacritics+Case. Get the Weight String associated
// to the array of input keys for L1-L3 comparison. Result has the form
// "<S1><0><S2><0><S3>"
// where each <S_i>::(\uHHHH)+ represents the weights at level i.
// ---
// `TM` :: Active weight map (TMAP.DATA or ~.WMAP)
// `IGNORE_VARS` :: Ignore variable elements.
// `wRemap` :: Optional object for remapping particular weight sequences
// (old_WS=>new_WS.) Used to customize variable elems.
// `RV3` :: [ADD200812] Reverse L3 weights.
// ---
// [WARNING] keys.length is not reliable, use keys.SIZE instead.