Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Parse complexContent with extension element #631

Merged
merged 2 commits into from
Feb 12, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
162 changes: 95 additions & 67 deletions src/main/scala/com/databricks/spark/xml/util/XSDToSchema.scala
Original file line number Diff line number Diff line change
Expand Up @@ -138,74 +138,30 @@ object XSDToSchema {
case unsupported =>
throw new IllegalArgumentException(s"Unsupported content: $unsupported")
}
case content: XmlSchemaComplexContent =>
val complexContent = content.getContent
complexContent match {
case extension: XmlSchemaComplexContentExtension =>
val baseStructField = getStructField(xmlSchema,
xmlSchema.getParent.getTypeByQName(extension.getBaseTypeName))
val baseFields = baseStructField.dataType match {
case structType: StructType => structType.fields
case others =>
throw new IllegalArgumentException(
s"Non-StructType in ComplexContentExtension: $others"
)
}

val extendedFields = getStructFieldsFromParticle(extension.getParticle, xmlSchema)
StructField(
schemaType.getQName.getLocalPart,
StructType(baseFields ++ extendedFields)
)
case unsupported =>
throw new IllegalArgumentException(s"Unsupported content: $unsupported")
}
case null =>
val childFields =
complexType.getParticle match {
// xs:all
case all: XmlSchemaAll =>
all.getItems.asScala.map {
case element: XmlSchemaElement =>
val baseStructField = getStructField(xmlSchema, element.getSchemaType)
val nullable = element.getMinOccurs == 0
if (element.getMaxOccurs == 1) {
StructField(element.getName, baseStructField.dataType, nullable)
} else {
StructField(element.getName, ArrayType(baseStructField.dataType), nullable)
}
}.toSeq
// xs:choice
case choice: XmlSchemaChoice =>
choice.getItems.asScala.map {
case element: XmlSchemaElement =>
val baseStructField = getStructField(xmlSchema, element.getSchemaType)
if (element.getMaxOccurs == 1) {
StructField(element.getName, baseStructField.dataType, true)
} else {
StructField(element.getName, ArrayType(baseStructField.dataType), true)
}
case any: XmlSchemaAny =>
val dataType = if (any.getMaxOccurs > 1) ArrayType(StringType) else StringType
StructField(XmlOptions.DEFAULT_WILDCARD_COL_NAME, dataType, true)
}.toSeq
// xs:sequence
case sequence: XmlSchemaSequence =>
// flatten xs:choice nodes
sequence.getItems.asScala.flatMap { _ match {
case choice: XmlSchemaChoice =>
choice.getItems.asScala.map { e =>
val xme = e.asInstanceOf[XmlSchemaElement]
val baseType = getStructField(xmlSchema, xme.getSchemaType).dataType
val dataType = if (xme.getMaxOccurs > 1) ArrayType(baseType) else baseType
StructField(xme.getName, dataType, true)
}
case e: XmlSchemaElement =>
val refQName = e.getRef.getTargetQName
val baseType =
if (refQName != null) {
getStructField(
xmlSchema,
xmlSchema.getParent.getElementByQName(refQName).getSchemaType).dataType
}
else getStructField(xmlSchema, e.getSchemaType).dataType
val dataType = if (e.getMaxOccurs > 1) ArrayType(baseType) else baseType
val nullable = e.getMinOccurs == 0
val structFieldName =
Option(refQName).map(_.getLocalPart).getOrElse(e.getName)
Seq(StructField(structFieldName, dataType, nullable))
case any: XmlSchemaAny =>
val dataType =
if (any.getMaxOccurs > 1) ArrayType(StringType) else StringType
val nullable = any.getMinOccurs == 0
Seq(StructField(XmlOptions.DEFAULT_WILDCARD_COL_NAME, dataType, nullable))
case unsupported =>
throw new IllegalArgumentException(s"Unsupported item: $unsupported")
}
}.toSeq
case null =>
Seq.empty
case unsupported =>
throw new IllegalArgumentException(s"Unsupported particle: $unsupported")
}
val childFields = getStructFieldsFromParticle(complexType.getParticle, xmlSchema)
val attributes = complexType.getAttributes.asScala.map {
case attribute: XmlSchemaAttribute =>
val attributeType = attribute.getSchemaTypeName match {
Expand Down Expand Up @@ -237,4 +193,76 @@ object XSDToSchema {
})
}

/**
 * Converts a complex-type particle into the struct fields describing its child items.
 *
 * Handling per particle kind:
 *  - `xs:all`: one field per element, nullable when `minOccurs` is 0, and wrapped in an
 *    `ArrayType` unless `maxOccurs` is exactly 1.
 *  - `xs:choice`: one field per element or `xs:any` wildcard, always nullable.
 *  - `xs:sequence`: one field per element (element references are resolved through the parent
 *    schema collection) or `xs:any` wildcard; nested `xs:choice` items are flattened into the
 *    same field list.
 *
 * @param particle the particle to convert; `null` yields an empty result
 * @param xmlSchema the schema handed to `getStructField` and used to look up referenced elements
 * @return the struct fields, in the order the items appear in the particle
 * @throws IllegalArgumentException if the particle, or any item inside it, is of a kind that is
 *                                  not handled here
 */
private def getStructFieldsFromParticle(
    particle: XmlSchemaParticle,
    xmlSchema: XmlSchema): Seq[StructField] = {
  particle match {
    // xs:all
    case all: XmlSchemaAll =>
      all.getItems.asScala.map {
        case element: XmlSchemaElement =>
          val baseType = getStructField(xmlSchema, element.getSchemaType).dataType
          val dataType = if (element.getMaxOccurs == 1) baseType else ArrayType(baseType)
          StructField(element.getName, dataType, nullable = element.getMinOccurs == 0)
        case unsupported =>
          // Fail with the same exception type as the xs:sequence branch rather than a MatchError.
          throw new IllegalArgumentException(s"Unsupported item: $unsupported")
      }.toSeq
    // xs:choice
    case choice: XmlSchemaChoice =>
      choice.getItems.asScala.map {
        case element: XmlSchemaElement =>
          val baseType = getStructField(xmlSchema, element.getSchemaType).dataType
          val dataType = if (element.getMaxOccurs == 1) baseType else ArrayType(baseType)
          StructField(element.getName, dataType, nullable = true)
        case any: XmlSchemaAny =>
          val dataType = if (any.getMaxOccurs > 1) ArrayType(StringType) else StringType
          StructField(XmlOptions.DEFAULT_WILDCARD_COL_NAME, dataType, nullable = true)
        case unsupported =>
          // Fail with the same exception type as the xs:sequence branch rather than a MatchError.
          throw new IllegalArgumentException(s"Unsupported item: $unsupported")
      }.toSeq
    // xs:sequence
    case sequence: XmlSchemaSequence =>
      sequence.getItems.asScala.flatMap {
        // flatten xs:choice nodes
        case choice: XmlSchemaChoice =>
          choice.getItems.asScala.map {
            case element: XmlSchemaElement =>
              val baseType = getStructField(xmlSchema, element.getSchemaType).dataType
              val dataType = if (element.getMaxOccurs > 1) ArrayType(baseType) else baseType
              StructField(element.getName, dataType, nullable = true)
            case any: XmlSchemaAny =>
              // Wildcards are mapped the same way as in a top-level xs:choice instead of
              // failing on an unchecked cast to XmlSchemaElement.
              val dataType = if (any.getMaxOccurs > 1) ArrayType(StringType) else StringType
              StructField(XmlOptions.DEFAULT_WILDCARD_COL_NAME, dataType, nullable = true)
            case unsupported =>
              throw new IllegalArgumentException(s"Unsupported item: $unsupported")
          }
        case element: XmlSchemaElement =>
          // A non-null target QName marks this item as a reference to an element declared
          // elsewhere; its type and name are then taken from the referenced declaration.
          val refQName = Option(element.getRef.getTargetQName)
          val schemaType = refQName
            .map(qName => xmlSchema.getParent.getElementByQName(qName).getSchemaType)
            .getOrElse(element.getSchemaType)
          val baseType = getStructField(xmlSchema, schemaType).dataType
          val dataType = if (element.getMaxOccurs > 1) ArrayType(baseType) else baseType
          val structFieldName = refQName.map(_.getLocalPart).getOrElse(element.getName)
          Seq(StructField(structFieldName, dataType, nullable = element.getMinOccurs == 0))
        case any: XmlSchemaAny =>
          val dataType = if (any.getMaxOccurs > 1) ArrayType(StringType) else StringType
          Seq(StructField(
            XmlOptions.DEFAULT_WILDCARD_COL_NAME, dataType, nullable = any.getMinOccurs == 0))
        case unsupported =>
          throw new IllegalArgumentException(s"Unsupported item: $unsupported")
      }.toSeq
    // The particle is absent: no child fields.
    case null =>
      Seq.empty
    case unsupported =>
      throw new IllegalArgumentException(s"Unsupported particle: $unsupported")
  }
}
}
25 changes: 25 additions & 0 deletions src/test/resources/complex-content-extension.xsd
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
<?xml version="1.0"?>
<!-- Test fixture: a complex type derived from another complex type through
     xs:complexContent / xs:extension. -->
<xs:schema xmlns:xs="http://www.w3.org/2001/XMLSchema">

<!-- Root element; its type is the derived type declared below. -->
<xs:element name="employee" type="fullpersoninfo"/>

<!-- Base type: contributes the firstname and lastname elements. -->
<xs:complexType name="personinfo">
<xs:sequence>
<xs:element name="firstname" type="xs:string"/>
<xs:element name="lastname" type="xs:string"/>
</xs:sequence>
</xs:complexType>

<!-- Derived type: extends personinfo with address, city and country. -->
<xs:complexType name="fullpersoninfo">
<xs:complexContent>
<xs:extension base="personinfo">
<xs:sequence>
<xs:element name="address" type="xs:string"/>
<xs:element name="city" type="xs:string"/>
<xs:element name="country" type="xs:string"/>
</xs:sequence>
</xs:extension>
</xs:complexContent>
</xs:complexType>

</xs:schema>
Original file line number Diff line number Diff line change
Expand Up @@ -152,4 +152,23 @@ class XSDToSchemaSuite extends AnyFunSuite {
)
assert(parsedSchema === expectedSchema)
}

test("Test complex content with extension element / Issue 554") {
  // `fullpersoninfo` extends `personinfo`, so the base type's fields are expected first,
  // followed by the fields declared inside the extension itself.
  val employeeType = struct(
    field("firstname", StringType, false),
    field("lastname", StringType, false),
    field("address", StringType, false),
    field("city", StringType, false),
    field("country", StringType, false)
  )
  val expectedSchema = buildSchema(field("employee", employeeType, false))

  val xsdPath = Paths.get(s"$resDir/complex-content-extension.xsd")
  val parsedSchema = XSDToSchema.read(xsdPath)

  assert(parsedSchema === expectedSchema)
}
}