forked from jcgregorio/uri-templates
-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathxml2sgml.tcl
3546 lines (2997 loc) · 105 KB
/
xml2sgml.tcl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/bin/sh
# the next line restarts using wish \
exec wish "$0" "$0" "$@"
# TODO:
# seriesInfo: RFC, I-D
# iref
#
# here begins TclXML 1.1.1
#
# earlier versions used to "package require xml 1.8", but because newer tcl
# installations have an incompatibly-verionsed sgml package, this caused
# nothing but problems. hence, we just include TclXML-1.1.1 wholesale toward
# the end of the file...
# sgml.tcl --
#
# This file provides generic parsing services for SGML-based
# languages, namely HTML and XML.
#
# NB. It is a misnomer. There is no support for parsing
# arbitrary SGML as such.
#
# Copyright (c) 1998,1999 Zveno Pty Ltd
# http://www.zveno.com/
#
# Zveno makes this software available free of charge for any purpose.
# Copies may be made of this software but all of this notice must be included
# on any copy.
#
# The software was developed for research purposes only and Zveno does not
# warrant that it is error free or fit for any purpose. Zveno disclaims any
# liability for all claims, expenses, losses, damages and costs any user may
# incur as a result of using, copying or modifying this software.
#
# Copyright (c) 1997 ANU and CSIRO on behalf of the
# participants in the CRC for Advanced Computational Systems ('ACSys').
#
# ACSys makes this software and all associated data and documentation
# ('Software') available free of charge for any purpose. You may make copies
# of the Software but you must include all of this notice on any copy.
#
# The Software was developed for research purposes and ACSys does not warrant
# that it is error free or fit for any purpose. ACSys disclaims any
# liability for all claims, expenses, losses, damages and costs any user may
# incur as a result of using, copying or modifying the Software.
#
# $Id$
package provide sgml 1.6
namespace eval sgml {
namespace export tokenise parseEvent
namespace export parseDTD
# Convenience routine
proc cl x {
return "\[$x\]"
}
# Define various regular expressions
# white space
variable Wsp " \t\r\n"
variable noWsp [cl ^$Wsp]
# Various XML names
variable nmtoken [cl -a-zA-Z0-9._]+
variable name [cl a-zA-Z_][cl -a-zA-Z0-9._]*
# Other
variable ParseEventNum
if {![info exists ParseEventNum]} {
set ParseEventNum 0
}
variable ParseDTDnum
if {![info exists ParseDTDNum]} {
set ParseDTDNum 0
}
# table of predefined entities for XML
variable EntityPredef
array set EntityPredef {
lt < gt > amp & quot \" apos '
}
}
# sgml::tokenise --
#
# Transform the given HTML/XML text into a Tcl list.
#
# Arguments:
# sgml text to tokenize
# elemExpr RE to recognise tags
# elemSub transform for matched tags
# args options
#
# Valid Options:
# -final boolean True if no more data is to be supplied
# -statevariable varName Name of a variable used to store info
#
# Results:
# Returns a Tcl list representing the document.
proc sgml::tokenise {sgml elemExpr elemSub args} {
array set options {-final 1}
catch {array set options $args}
set options(-final) [Boolean $options(-final)]
# If the data is not final then there must be a variable to store
# unused data.
if {!$options(-final) && ![info exists options(-statevariable)]} {
return -code error {option "-statevariable" required if not final}
}
# Pre-process stage
#
# Extract the internal DTD subset, if any
catch {upvar #0 $options(-internaldtdvariable) dtd}
if {[regexp {<!DOCTYPE[^[<]+\[([^]]+)\]} $sgml discard dtd]} {
regsub {(<!DOCTYPE[^[<]+)(\[[^]]+\])} $sgml {\1\&xml:intdtd;} sgml
}
# Protect Tcl special characters
regsub -all {([{}\\])} $sgml {\\\1} sgml
# Do the translation
if {[info exists options(-statevariable)]} {
upvar #0 $opts(-statevariable) unused
if {[info exists unused]} {
regsub -all $elemExpr $unused$sgml $elemSub sgml
unset unused
} else {
regsub -all $elemExpr $sgml $elemSub sgml
}
set sgml "{} {} {} {} \{$sgml\}"
# Performance note (Tcl 8.0):
# Use of lindex, lreplace will cause parsing to list object
if {[regexp {^([^<]*)(<[^>]*$)} [lindex $sgml end] x text unused]} {
set sgml [lreplace $sgml end end $text]
}
} else {
# Performance note (Tcl 8.0):
# In this case, no conversion to list object is performed
regsub -all $elemExpr $sgml $elemSub sgml
set sgml "{} {} {} {} \{$sgml\}"
}
return $sgml
}
# sgml::parseEvent --
#
# Produces an event stream for a XML/HTML document,
# given the Tcl list format returned by tokenise.
#
# This procedure checks that the document is well-formed,
# and throws an error if the document is found to be not
# well formed. Warnings are passed via the -warningcommand script.
#
# The procedure only check for well-formedness,
# no DTD is required. However, facilities are provided for entity expansion.
#
# Arguments:
# sgml Instance data, as a Tcl list.
# args option/value pairs
#
# Valid Options:
# -final Indicates end of document data
# -elementstartcommand Called when an element starts
# -elementendcommand Called when an element ends
# -characterdatacommand Called when character data occurs
# -entityreferencecommand Called when an entity reference occurs
# -processinginstructioncommand Called when a PI occurs
# -externalentityrefcommand Called for an external entity reference
#
# (Not compatible with expat)
# -xmldeclcommand Called when the XML declaration occurs
# -doctypecommand Called when the document type declaration occurs
# -commentcommand Called when a comment occurs
#
# -errorcommand Script to evaluate for a fatal error
# -warningcommand Script to evaluate for a reportable warning
# -statevariable global state variable
# -normalize whether to normalize names
# -reportempty whether to include an indication of empty elements
#
# Results:
# The various callback scripts are invoked.
# Returns empty string.
#
# BUGS:
# If command options are set to empty string then they should not be invoked.
proc sgml::parseEvent {sgml args} {
variable Wsp
variable noWsp
variable nmtoken
variable name
variable ParseEventNum
array set options [list \
-elementstartcommand [namespace current]::noop \
-elementendcommand [namespace current]::noop \
-characterdatacommand [namespace current]::noop \
-processinginstructioncommand [namespace current]::noop \
-externalentityrefcommand [namespace current]::noop \
-xmldeclcommand [namespace current]::noop \
-doctypecommand [namespace current]::noop \
-commentcommand [namespace current]::noop \
-entityreferencecommand {} \
-warningcommand [namespace current]::noop \
-errorcommand [namespace current]::Error \
-final 1 \
-emptyelement [namespace current]::EmptyElement \
-parseattributelistcommand [namespace current]::noop \
-normalize 1 \
-internaldtd {} \
-reportempty 0 \
-entityvariable [namespace current]::EntityPredef \
]
catch {array set options $args}
if {![info exists options(-statevariable)]} {
set options(-statevariable) [namespace current]::ParseEvent[incr ParseEventNum]
}
upvar #0 $options(-statevariable) state
upvar #0 $options(-entityvariable) entities
if {![info exists state]} {
# Initialise the state variable
array set state {
mode normal
haveXMLDecl 0
haveDocElement 0
context {}
stack {}
line 0
}
}
foreach {tag close empty param text} $sgml {
# Keep track of lines in the input
incr state(line) [regsub -all \n $param {} discard]
incr state(line) [regsub -all \n $text {} discard]
# If the current mode is cdata or comment then we must undo what the
# regsub has done to reconstitute the data
switch $state(mode) {
comment {
# This had "[string length $param] && " as a guard -
# can't remember why :-(
if {[regexp ([cl ^-]*)--\$ $tag discard comm1]} {
# end of comment (in tag)
set tag {}
set close {}
set empty {}
set state(mode) normal
uplevel #0 $options(-commentcommand) [list $state(commentdata)<$comm1]
unset state(commentdata)
} elseif {[regexp ([cl ^-]*)--\$ $param discard comm1]} {
# end of comment (in attributes)
uplevel #0 $options(-commentcommand) [list $state(commentdata)<$close$tag$empty>$comm1]
unset state(commentdata)
set tag {}
set param {}
set close {}
set empty {}
set state(mode) normal
} elseif {[regexp ([cl ^-]*)-->(.*) $text discard comm1 text]} {
# end of comment (in text)
uplevel #0 $options(-commentcommand) [list $state(commentdata)<$close$tag$param$empty>$comm1]
unset state(commentdata)
set tag {}
set param {}
set close {}
set empty {}
set state(mode) normal
} else {
# comment continues
append state(commentdata) <$close$tag$param$empty>$text
continue
}
}
cdata {
if {[string length $param] && [regexp ([cl ^\]]*)\]\][cl $Wsp]*\$ $tag discard cdata1]} {
# end of CDATA (in tag)
uplevel #0 $options(-characterdatacommand) [list $state(cdata)<$close$cdata1$text]
set text {}
set tag {}
unset state(cdata)
set state(mode) normal
} elseif {[regexp ([cl ^\]]*)\]\][cl $Wsp]*\$ $param discard cdata1]} {
# end of CDATA (in attributes)
uplevel #0 $options(-characterdatacommand) [list $state(cdata)<$close$tag$cdata1$text]
set text {}
set tag {}
set param {}
unset state(cdata)
set state(mode) normal
} elseif {[regexp ([cl ^\]]*)\]\][cl $Wsp]*>(.*) $text discard cdata1 text]} {
# end of CDATA (in text)
uplevel #0 $options(-characterdatacommand) [list $state(cdata)<$close$tag$param$empty>$cdata1$text]
set text {}
set tag {}
set param {}
set close {}
set empty {}
unset state(cdata)
set state(mode) normal
} else {
# CDATA continues
append state(cdata) <$close$tag$param$empty>$text
continue
}
}
}
# default: normal mode
# Bug: if the attribute list has a right angle bracket then the empty
# element marker will not be seen
set isEmpty [uplevel #0 $options(-emptyelement) [list $tag $param $empty]]
if {[llength $isEmpty]} {
foreach {empty tag param} $isEmpty break
}
switch -glob -- [string length $tag],[regexp {^\?|!.*} $tag],$close,$empty {
0,0,, {
# Ignore empty tag - dealt with non-normal mode above
}
*,0,, {
# Start tag for an element.
# Check for a right angle bracket in an attribute value
# This manifests itself by terminating the value before
# the delimiter is seen, and the delimiter appearing
# in the text
# BUG: If two or more attribute values have right angle
# brackets then this will fail on the second one.
if {[regexp [format {=[%s]*"[^"]*$} $Wsp] $param] && \
[regexp {([^"]*"[^>]*)>(.*)} $text discard attrListRemainder text]} {
append param >$attrListRemainder
} elseif {[regexp [format {=[%s]*'[^']*$} $Wsp] $param] && \
[regexp {([^']*'[^>]*)>(.*)} $text discard attrListRemainder text]} {
append param >$attrListRemainder
}
# Check if the internal DTD entity is in an attribute
# value
regsub -all &xml:intdtd\; $param \[$options(-internaldtd)\] param
ParseEvent:ElementOpen $tag $param options
set state(haveDocElement) 1
}
*,0,/, {
# End tag for an element.
ParseEvent:ElementClose $tag options
}
*,0,,/ {
# Empty element
ParseEvent:ElementOpen $tag $param options -empty 1
ParseEvent:ElementClose $tag options -empty 1
}
*,1,* {
# Processing instructions or XML declaration
switch -glob -- $tag {
{\?xml} {
# XML Declaration
if {$state(haveXMLDecl)} {
uplevel #0 $options(-errorcommand) "unexpected characters \"<$tag\" around line $state(line)"
} elseif {![regexp {\?$} $param]} {
uplevel #0 $options(-errorcommand) "XML Declaration missing characters \"?>\" around line $state(line)"
} else {
# Get the version number
if {[regexp {[ ]*version="(-+|[a-zA-Z0-9_.:]+)"[ ]*} $param discard version] || [regexp {[ ]*version='(-+|[a-zA-Z0-9_.:]+)'[ ]*} $param discard version]} {
if {[string compare $version "1.0"]} {
# Should we support future versions?
# At least 1.X?
uplevel #0 $options(-errorcommand) "document XML version \"$version\" is incompatible with XML version 1.0"
}
} else {
uplevel #0 $options(-errorcommand) "XML Declaration missing version information around line $state(line)"
}
# Get the encoding declaration
set encoding {}
regexp {[ ]*encoding="([A-Za-z]([A-Za-z0-9._]|-)*)"[ ]*} $param discard encoding
regexp {[ ]*encoding='([A-Za-z]([A-Za-z0-9._]|-)*)'[ ]*} $param discard encoding
# Get the standalone declaration
set standalone {}
regexp {[ ]*standalone="(yes|no)"[ ]*} $param discard standalone
regexp {[ ]*standalone='(yes|no)'[ ]*} $param discard standalone
# Invoke the callback
uplevel #0 $options(-xmldeclcommand) [list $version $encoding $standalone]
}
}
{\?*} {
# Processing instruction
if {![regsub {\?$} $param {} param]} {
uplevel #0 $options(-errorcommand) "PI: expected '?' character around line $state(line)"
} else {
uplevel #0 $options(-processinginstructioncommand) [list [string range $tag 1 end] [string trimleft $param]]
}
}
!DOCTYPE {
# External entity reference
# This should move into xml.tcl
# Parse the params supplied. Looking for Name, ExternalID and MarkupDecl
regexp ^[cl $Wsp]*($name)(.*) $param x state(doc_name) param
set state(doc_name) [Normalize $state(doc_name) $options(-normalize)]
set externalID {}
set pubidlit {}
set systemlit {}
set externalID {}
if {[regexp -nocase ^[cl $Wsp]*(SYSTEM|PUBLIC)(.*) $param x id param]} {
switch [string toupper $id] {
SYSTEM {
if {[regexp ^[cl $Wsp]+"([cl ^"]*)"(.*) $param x systemlit param] || [regexp ^[cl $Wsp]+'([cl ^']*)'(.*) $param x systemlit param]} {
set externalID [list SYSTEM $systemlit] ;# "
} else {
uplevel #0 $options(-errorcommand) {{syntax error: SYSTEM identifier not followed by literal}}
}
}
PUBLIC {
if {[regexp ^[cl $Wsp]+"([cl ^"]*)"(.*) $param x pubidlit param] || [regexp ^[cl $Wsp]+'([cl ^']*)'(.*) $param x pubidlit param]} {
if {[regexp ^[cl $Wsp]+"([cl ^"]*)"(.*) $param x systemlit param] || [regexp ^[cl $Wsp]+'([cl ^']*)'(.*) $param x systemlit param]} {
set externalID [list PUBLIC $pubidlit $systemlit]
} else {
uplevel #0 $options(-errorcommand) "syntax error: PUBLIC identifier not followed by system literal around line $state(line)"
}
} else {
uplevel #0 $options(-errorcommand) "syntax error: PUBLIC identifier not followed by literal around line $state(line)"
}
}
}
if {[regexp -nocase ^[cl $Wsp]+NDATA[cl $Wsp]+($name)(.*) $param x notation param]} {
lappend externalID $notation
}
}
uplevel #0 $options(-doctypecommand) $state(doc_name) [list $pubidlit $systemlit $options(-internaldtd)]
}
!--* {
# Start of a comment
# See if it ends in the same tag, otherwise change the
# parsing mode
regexp {!--(.*)} $tag discard comm1
if {[regexp ([cl ^-]*)--[cl $Wsp]*\$ $comm1 discard comm1_1]} {
# processed comment (end in tag)
uplevel #0 $options(-commentcommand) [list $comm1_1]
} elseif {[regexp ([cl ^-]*)--[cl $Wsp]*\$ $param discard comm2]} {
# processed comment (end in attributes)
uplevel #0 $options(-commentcommand) [list $comm1$comm2]
} elseif {[regexp ([cl ^-]*)-->(.*) $text discard comm2 text]} {
# processed comment (end in text)
uplevel #0 $options(-commentcommand) [list $comm1$param>$comm2]
} else {
# start of comment
set state(mode) comment
set state(commentdata) "$comm1$param>$text"
continue
}
}
{!\[CDATA\[*} {
regexp {!\[CDATA\[(.*)} $tag discard cdata1
if {[regexp {(.*)]]$} $param discard cdata2]} {
# processed CDATA (end in attribute)
uplevel #0 $options(-characterdatacommand) [list $cdata1$cdata2$text]
set text {}
} elseif {[regexp {(.*)]]>(.*)} $text discard cdata2 text]} {
# processed CDATA (end in text)
uplevel #0 $options(-characterdatacommand) [list $cdata1$param$empty>$cdata2$text]
set text {}
} else {
# start CDATA
set state(cdata) "$cdata1$param>$text"
set state(mode) cdata
continue
}
}
!ELEMENT {
# Internal DTD declaration
}
!ATTLIST {
}
!ENTITY {
}
!NOTATION {
}
!* {
uplevel #0 $options(-processinginstructioncommand) [list $tag $param]
}
default {
uplevel #0 $options(-errorcommand) [list "unknown processing instruction \"<$tag>\" around line $state(line)"]
}
}
}
*,1,* -
*,0,/,/ {
# Syntax error
uplevel #0 $options(-errorcommand) [list [list syntax error: closed/empty tag: tag $tag param $param empty $empty close $close around line $state(line)]]
}
}
# Process character data
if {$state(haveDocElement) && [llength $state(stack)]} {
# Check if the internal DTD entity is in the text
regsub -all &xml:intdtd\; $text \[$options(-internaldtd)\] text
# Look for entity references
if {([array size entities] || [string length $options(-entityreferencecommand)]) && \
[regexp {&[^;]+;} $text]} {
# protect Tcl specials
regsub -all {([][$\\])} $text {\\\1} text
# Mark entity references
regsub -all {&([^;]+);} $text [format {%s; %s {\1} ; %s %s} \}\} [namespace code [list Entity options $options(-entityreferencecommand) $options(-characterdatacommand) $options(-entityvariable)]] [list uplevel #0 $options(-characterdatacommand)] \{\{] text
set text "uplevel #0 $options(-characterdatacommand) {{$text}}"
eval $text
} else {
# Restore protected special characters
regsub -all {\\([{}\\])} $text {\1} text
uplevel #0 $options(-characterdatacommand) [list $text]
}
} elseif {[string length [string trim $text]]} {
uplevel #0 $options(-errorcommand) "unexpected text \"$text\" in document prolog around line $state(line)"
}
}
# If this is the end of the document, close all open containers
if {$options(-final) && [llength $state(stack)]} {
eval $options(-errorcommand) [list [list element [lindex $state(stack) end] remains unclosed around line $state(line)]]
}
return {}
}
# sgml::ParseEvent:ElementOpen --
#
# Start of an element.
#
# Arguments:
# tag Element name
# attr Attribute list
# opts Option variable in caller
# args further configuration options
#
# Options:
# -empty boolean
# indicates whether the element was an empty element
#
# Results:
# Modify state and invoke callback
proc sgml::ParseEvent:ElementOpen {tag attr opts args} {
upvar $opts options
upvar #0 $options(-statevariable) state
array set cfg {-empty 0}
array set cfg $args
if {$options(-normalize)} {
set tag [string toupper $tag]
}
# Update state
lappend state(stack) $tag
# Parse attribute list into a key-value representation
if {[string compare $options(-parseattributelistcommand) {}]} {
if {[catch {uplevel #0 $options(-parseattributelistcommand) [list $attr]} attr]} {
uplevel #0 $options(-errorcommand) [list $attr around line $state(line)]
set attr {}
}
}
set empty {}
if {$cfg(-empty) && $options(-reportempty)} {
set empty {-empty 1}
}
# Invoke callback
uplevel #0 $options(-elementstartcommand) [list $tag $attr] $empty
return {}
}
# sgml::ParseEvent:ElementClose --
#
# End of an element.
#
# Arguments:
# tag Element name
# opts Option variable in caller
# args further configuration options
#
# Options:
# -empty boolean
# indicates whether the element as an empty element
#
# Results:
# Modify state and invoke callback
proc sgml::ParseEvent:ElementClose {tag opts args} {
upvar $opts options
upvar #0 $options(-statevariable) state
array set cfg {-empty 0}
array set cfg $args
# WF check
if {[string compare $tag [lindex $state(stack) end]]} {
uplevel #0 $options(-errorcommand) [list "end tag \"$tag\" does not match open element \"[lindex $state(stack) end]\" around line $state(line)"]
return
}
# Update state
set state(stack) [lreplace $state(stack) end end]
set empty {}
if {$cfg(-empty) && $options(-reportempty)} {
set empty {-empty 1}
}
# Invoke callback
uplevel #0 $options(-elementendcommand) [list $tag] $empty
return {}
}
# sgml::Normalize --
#
# Perform name normalization if required
#
# Arguments:
# name name to normalize
# req normalization required
#
# Results:
# Name returned as upper-case if normalization required
proc sgml::Normalize {name req} {
if {$req} {
return [string toupper $name]
} else {
return $name
}
}
# sgml::Entity --
#
# Resolve XML entity references (syntax: &xxx;).
#
# Arguments:
# opts options array variable in caller
# entityrefcmd application callback for entity references
# pcdatacmd application callback for character data
# entities name of array containing entity definitions.
# ref entity reference (the "xxx" bit)
#
# Results:
# Returns substitution text for given entity.
proc sgml::Entity {opts entityrefcmd pcdatacmd entities ref} {
upvar 2 $opts options
upvar #0 $options(-statevariable) state
if {![string length $entities]} {
set entities [namespace current EntityPredef]
}
switch -glob -- $ref {
%* {
# Parameter entity - not recognised outside of a DTD
}
#x* {
# Character entity - hex
if {[catch {format %c [scan [string range $ref 2 end] %x tmp; set tmp]} char]} {
return -code error "malformed character entity \"$ref\""
}
uplevel #0 $pcdatacmd [list $char]
return {}
}
#* {
# Character entity - decimal
if {[catch {format %c [scan [string range $ref 1 end] %d tmp; set tmp]} char]} {
return -code error "malformed character entity \"$ref\""
}
uplevel #0 $pcdatacmd [list $char]
return {}
}
default {
# General entity
upvar #0 $entities map
if {[info exists map($ref)]} {
if {![regexp {<|&} $map($ref)]} {
# Simple text replacement - optimise
uplevel #0 $pcdatacmd [list $map($ref)]
return {}
}
# Otherwise an additional round of parsing is required.
# This only applies to XML, since HTML doesn't have general entities
# Must parse the replacement text for start & end tags, etc
# This text must be self-contained: balanced closing tags, and so on
set tokenised [tokenise $map($ref) $::xml::tokExpr $::xml::substExpr]
set final $options(-final)
unset options(-final)
eval parseEvent [list $tokenised] [array get options] -final 0
set options(-final) $final
return {}
} elseif {[string length $entityrefcmd]} {
uplevel #0 $entityrefcmd [list $ref]
return {}
}
}
}
# If all else fails leave the entity reference untouched
uplevel #0 $pcdatacmd [list &$ref\;]
return {}
}
####################################
#
# DTD parser for SGML (XML).
#
# This DTD actually only handles XML DTDs. Other language's
# DTD's, such as HTML, must be written in terms of a XML DTD.
#
# A DTD is represented as a three element Tcl list.
# The first element contains the content models for elements,
# the second contains the attribute lists for elements and
# the last element contains the entities for the document.
#
####################################
# sgml::parseDTD --
#
# Entry point to the SGML DTD parser.
#
# Arguments:
# dtd data defining the DTD to be parsed
# args configuration options
#
# Results:
# Returns a three element list, first element is the content model
# for each element, second element are the attribute lists of the
# elements and the third element is the entity map.
proc sgml::parseDTD {dtd args} {
variable Wsp
variable ParseDTDnum
array set opts [list \
-errorcommand [namespace current]::noop \
state [namespace current]::parseDTD[incr ParseDTDnum]
]
array set opts $args
set exp <!([cl ^$Wsp>]+)[cl $Wsp]*([cl ^$Wsp]+)[cl $Wsp]*([cl ^>]*)>
set sub {{\1} {\2} {\3} }
regsub -all $exp $dtd $sub dtd
foreach {decl id value} $dtd {
catch {DTD:[string toupper $decl] $id $value} err
}
return [list [array get contentmodel] [array get attributes] [array get entities]]
}
# Procedures for handling the various declarative elements in a DTD.
# New elements may be added by creating a procedure of the form
# parse:DTD:_element_
# For each of these procedures, the various regular expressions they use
# are created outside of the proc to avoid overhead at runtime
# sgml::DTD:ELEMENT --
#
# <!ELEMENT ...> defines an element.
#
# The content model for the element is stored in the contentmodel array,
# indexed by the element name. The content model is parsed into the
# following list form:
#
# {} Content model is EMPTY.
# Indicated by an empty list.
# * Content model is ANY.
# Indicated by an asterix.
# {ELEMENT ...}
# Content model is element-only.
# {MIXED {element1 element2 ...}}
# Content model is mixed (PCDATA and elements).
# The second element of the list contains the
# elements that may occur. #PCDATA is assumed
# (ie. the list is normalised).
#
# Arguments:
# id identifier for the element.
# value other information in the PI
proc sgml::DTD:ELEMENT {id value} {
dbgputs DTD_parse [list DTD:ELEMENT $id $value]
variable Wsp
upvar opts state
upvar contentmodel cm
if {[info exists cm($id)]} {
eval $state(-errorcommand) element [list "element \"$id\" already declared"]
} else {
switch -- $value {
EMPTY {
set cm($id) {}
}
ANY {
set cm($id) *
}
default {
if {[regexp [format {^\([%s]*#PCDATA[%s]*(\|([^)]+))?[%s]*\)*[%s]*$} $Wsp $Wsp $Wsp $Wsp] discard discard mtoks]} {
set cm($id) [list MIXED [split $mtoks |]]
} else {
if {[catch {CModelParse $state(state) $value} result]} {
eval $state(-errorcommand) element [list $result]
} else {
set cm($id) [list ELEMENT $result]
}
}
}
}
}
}
# sgml::CModelParse --
#
# Parse an element content model (non-mixed).
# A syntax tree is constructed.
# A transition table is built next.
#
# This is going to need alot of work!
#
# Arguments:
# state state array variable
# value the content model data
#
# Results:
# A Tcl list representing the content model.
proc sgml::CModelParse {state value} {
upvar #0 $state var
# First build syntax tree
set syntaxTree [CModelMakeSyntaxTree $state $value]
# Build transition table
set transitionTable [CModelMakeTransitionTable $state $syntaxTree]
return [list $syntaxTree $transitionTable]
}
# sgml::CModelMakeSyntaxTree --
#
# Construct a syntax tree for the regular expression.
#
# Syntax tree is represented as a Tcl list:
# rep {:choice|:seq {{rep list1} {rep list2} ...}}
# where: rep is repetition character, *, + or ?. {} for no repetition
# listN is nested expression or Name
#
# Arguments:
# spec Element specification
#
# Results:
# Syntax tree for element spec as nested Tcl list.
#
# Examples:
# (memo)
# {} {:seq {{} memo}}
# (front, body, back?)
# {} {:seq {{} front} {{} body} {? back}}
# (head, (p | list | note)*, div2*)
# {} {:seq {{} head} {* {:choice {{} p} {{} list} {{} note}}} {* div2}}
# (p | a | ul)+
# + {:choice {{} p} {{} a} {{} ul}}
proc sgml::CModelMakeSyntaxTree {state spec} {
upvar #0 $state var
variable Wsp
variable name
# Translate the spec into a Tcl list.
# None of the Tcl special characters are allowed in a content model spec.
if {[regexp {\$|\[|\]|\{|\}} $spec]} {
return -code error "illegal characters in specification"
}
regsub -all [format {(%s)[%s]*(\?|\*|\+)?[%s]*(,|\|)?} $name $Wsp $Wsp] $spec [format {%sCModelSTname %s {\1} {\2} {\3}} \n $state] spec
regsub -all {\(} $spec "\nCModelSTopenParen $state " spec
regsub -all [format {\)[%s]*(\?|\*|\+)?[%s]*(,|\|)?} $Wsp $Wsp] $spec [format {%sCModelSTcloseParen %s {\1} {\2}} \n $state] spec
array set var {stack {} state start}
eval $spec
# Peel off the outer seq, its redundant
return [lindex [lindex $var(stack) 1] 0]
}
# sgml::CModelSTname --
#
# Processes a name in a content model spec.
#
# Arguments:
# state state array variable
# name name specified
# rep repetition operator
# cs choice or sequence delimiter
#
# Results:
# See CModelSTcp.
proc sgml::CModelSTname {state name rep cs args} {
if {[llength $args]} {
return -code error "syntax error in specification: \"$args\""
}
CModelSTcp $state $name $rep $cs
}