From 926132a2d3d75f18ae159999656bec9ea7f0f125 Mon Sep 17 00:00:00 2001 From: "eliot.kimber" Date: Sun, 9 Jun 2024 10:04:31 -0500 Subject: [PATCH 1/4] Fixes #133, #105: Set compatibility mode setting to turn off compatibility mode. Signed-off-by: eliot.kimber --- .../xml2docx/generator/DocxGenerator.java | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/main/java/org/wordinator/xml2docx/generator/DocxGenerator.java b/src/main/java/org/wordinator/xml2docx/generator/DocxGenerator.java index eb94bbf..fb617da 100644 --- a/src/main/java/org/wordinator/xml2docx/generator/DocxGenerator.java +++ b/src/main/java/org/wordinator/xml2docx/generator/DocxGenerator.java @@ -75,6 +75,7 @@ import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTBookmark; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTBorder; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTCompat; +import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTCompatSetting; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTDecimalNumber; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTDocument1; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTFldChar; @@ -451,7 +452,7 @@ private void constructDoc(XWPFDocument doc, XmlObject xml) throws DocxGeneration cursor.pop(); cursor.push(); cursor.toChild(new QName(DocxConstants.SIMPLE_WP_NS, "body")); - + setDocSettings(doc, xml); handleBody(doc, cursor.getObject()); cursor.pop(); @@ -462,7 +463,6 @@ private void constructDoc(XWPFDocument doc, XmlObject xml) throws DocxGeneration setupPageSequence(doc, cursor.getObject()); } else { CTDocument1 document = doc.getDocument(); - setDocSettings(doc, xml); CTBody body = (document.isSetBody() ? document.getBody() : document.addNewBody()); @SuppressWarnings("unused") CTSectPr sectPr = (body.isSetSectPr() ? 
body.getSectPr() : body.addNewSectPr()); @@ -479,12 +479,17 @@ private void constructDoc(XWPFDocument doc, XmlObject xml) throws DocxGeneration * @param xml Simple ML doc */ private void setDocSettings(XWPFDocument doc, XmlObject xml) { + // Issue #133: Turn off compatibility mode. XWPFSettings settings = doc.getSettings(); CTSettings ctSettings = settings.getCTSettings(); CTCompat compat = ctSettings.addNewCompat(); // This may be all we need to do. - - + CTCompatSetting compatSetting = compat.addNewCompatSetting(); + // Name, URI, and value come from inspecting working Word docs. + // I do not know where these values are documented. + compatSetting.setName("compatibilityMode"); + compatSetting.setUri("http://schemas.microsoft.com/office/word"); + compatSetting.setVal("15"); } /** From a3903f614caebc63004f24dc5a1eb5b5a9294769 Mon Sep 17 00:00:00 2001 From: "eliot.kimber" Date: Sun, 4 Aug 2024 15:14:36 -0500 Subject: [PATCH 2/4] Fixes #109: Incorporate fix from Lars Marius to ensure table cell ends with

Signed-off-by: eliot.kimber --- .../xml2docx/generator/DocxGenerator.java | 98 +++++++++++-------- .../xml2docx/TestDocxGenerator.java | 67 ++++++++++++- .../simplewp/simplewpml-table-empty-cell.swpx | 23 +++++ .../simplewp/simplewpml-table-nested-03.swpx | 27 +++++ 4 files changed, 172 insertions(+), 43 deletions(-) create mode 100644 src/test/resources/simplewp/simplewpml-table-empty-cell.swpx create mode 100644 src/test/resources/simplewp/simplewpml-table-nested-03.swpx diff --git a/src/main/java/org/wordinator/xml2docx/generator/DocxGenerator.java b/src/main/java/org/wordinator/xml2docx/generator/DocxGenerator.java index fb617da..088d13d 100644 --- a/src/main/java/org/wordinator/xml2docx/generator/DocxGenerator.java +++ b/src/main/java/org/wordinator/xml2docx/generator/DocxGenerator.java @@ -3278,49 +3278,63 @@ private XWPFTableRow makeTableRow( // Issue 134: If is empty, hasMore will be false. if (!hasMore) { // Leave the empty paragraph, which is required by Word. + } else { - // Cells always have at least one paragraph. 
- cell.removeParagraph(0); - - while (hasMore) { - if (cursor.getName().equals(DocxConstants.QNAME_P_ELEM)) { - XWPFParagraph p = cell.addParagraph(); - makeParagraph(p, cursor); - if (null != align) { - if ("JUSTIFY".equalsIgnoreCase(align)) { - // Issue 18: "BOTH" is the better match to "JUSTIFY" - align = "BOTH"; // Slight mistmatch between markup and model - } - if ("CHAR".equalsIgnoreCase(align)) { - // I'm not sure this is the best mapping but it seemed close enough - align = "NUM_TAB"; // Slight mistmatch between markup and model - } - ParagraphAlignment alignment = ParagraphAlignment.valueOf(align.toUpperCase()); - p.setAlignment(alignment); - } - } else if (cursor.getName().equals(DocxConstants.QNAME_TABLE_ELEM)) { - // record how many tables were in the cell previously - int preTables = cell.getCTTc().getTblList().size(); - - CTTbl ctTbl = cell.getCTTc().addNewTbl(); - ctTbl = cell.getCTTc().addNewTbl(); - CTTblPr tblPr = ctTbl.addNewTblPr(); - tblPr.addNewTblW(); - - XWPFTable nestedTable = new XWPFTable(ctTbl, cell); - makeTable(nestedTable, cursor.getObject()); - - // for some reason this inserts two tables, where the - // first one is empty. we need to remove that one. - // luckily, the number of tables we used to have equals - // the index of the first new table - cell.getCTTc().removeTbl(preTables); - } else { - log.warn("Table cell contains unknown element {} -- skipping", cursor.getName()); - } - - hasMore = cursor.toNextSibling(); - } + // Cells always have at least one paragraph. + cell.removeParagraph(0); + + // the cell has to *end* with a paragraph, so if the last block isn't + // a paragraph we need to add one at the end. 
using this to track + boolean lastIsParagraph = false; + + // convert the contents of the cell + while (hasMore) { + if (cursor.getName().equals(DocxConstants.QNAME_P_ELEM)) { + lastIsParagraph = true; + XWPFParagraph p = cell.addParagraph(); + makeParagraph(p, cursor); + if (null != align) { + if ("JUSTIFY".equalsIgnoreCase(align)) { + // Issue 18: "BOTH" is the better match to "JUSTIFY" + align = "BOTH"; // Slight mistmatch between markup and model + } + if ("CHAR".equalsIgnoreCase(align)) { + // I'm not sure this is the best mapping but it seemed close enough + align = "NUM_TAB"; // Slight mistmatch between markup and model + } + ParagraphAlignment alignment = ParagraphAlignment.valueOf(align.toUpperCase()); + p.setAlignment(alignment); + } + } else if (cursor.getName().equals(DocxConstants.QNAME_TABLE_ELEM)) { + lastIsParagraph = false; + + // record how many tables were in the cell previously + int preTables = cell.getCTTc().getTblList().size(); + + CTTbl ctTbl = cell.getCTTc().addNewTbl(); + ctTbl = cell.getCTTc().addNewTbl(); + CTTblPr tblPr = ctTbl.addNewTblPr(); + tblPr.addNewTblW(); + + XWPFTable nestedTable = new XWPFTable(ctTbl, cell); + makeTable(nestedTable, cursor.getObject()); + + // for some reason this inserts two tables, where the + // first one is empty. we need to remove that one. 
+ // luckily, the number of tables we used to have equals + // the index of the first new table + cell.getCTTc().removeTbl(preTables); + } else { + log.warn("Table cell contains unknown element {} -- skipping", cursor.getName()); + } + + hasMore = cursor.toNextSibling(); + } + + // cell didn't end in a paragraph, so need to add one + if (!lastIsParagraph) { + cell.addParagraph(); + } } } cursor.pop(); diff --git a/src/test/java/org/wordinator/xml2docx/TestDocxGenerator.java b/src/test/java/org/wordinator/xml2docx/TestDocxGenerator.java index beea0d8..81b0735 100644 --- a/src/test/java/org/wordinator/xml2docx/TestDocxGenerator.java +++ b/src/test/java/org/wordinator/xml2docx/TestDocxGenerator.java @@ -705,13 +705,16 @@ public void testNestedTable() throws Exception { XWPFTableCell cell = row.getCell(0); contents = cell.getBodyElements(); - assertEquals(1, contents.size()); + assertEquals(2, contents.size()); it = contents.iterator(); elem = it.next(); assertEquals(BodyElementType.TABLE, elem.getElementType()); t = (XWPFTable) elem; assertEquals(2, t.getNumberOfRows()); + + elem = it.next(); + assertEquals(BodyElementType.PARAGRAPH, elem.getElementType()); } @Test @@ -759,6 +762,68 @@ public void testKeepLines() throws Exception { assertEquals("Expected on", value, "on"); } + @Test + public void testTableEmptyCell() throws Exception { + XWPFDocument doc = convert("simplewp/simplewpml-table-empty-cell.swpx", "out/table-empty-cell.docx"); + + List contents = doc.getBodyElements(); + assertEquals(1, contents.size()); + + Iterator it = contents.iterator(); + IBodyElement elem = it.next(); + assertEquals(BodyElementType.TABLE, elem.getElementType()); + + XWPFTable t = (XWPFTable) elem; + assertEquals(1, t.getNumberOfRows()); + + XWPFTableRow row = t.getRow(0); + assertEquals(2, row.getTableCells().size()); + + XWPFTableCell cell = row.getCell(0); + contents = cell.getBodyElements(); + assertEquals(1, contents.size()); + + cell = row.getCell(1); // the empty cell + 
contents = cell.getBodyElements(); + assertEquals(1, contents.size()); // used to fail with 0 + } + + @Test + public void testNestedTableParaBeforeTable() throws Exception { + XWPFDocument doc = convert("simplewp/simplewpml-table-nested-03.swpx", "out/table-nested-01.docx"); + + List contents = doc.getBodyElements(); + assertEquals(1, contents.size()); + + Iterator it = contents.iterator(); + IBodyElement elem = it.next(); + assertEquals(BodyElementType.TABLE, elem.getElementType()); + + XWPFTable t = (XWPFTable) elem; + assertEquals(1, t.getNumberOfRows()); + + XWPFTableRow row = t.getRow(0); + assertEquals(1, row.getTableCells().size()); + + XWPFTableCell cell = row.getCell(0); + contents = cell.getBodyElements(); + assertEquals(3, contents.size()); + + it = contents.iterator(); + elem = it.next(); + + assertEquals(BodyElementType.PARAGRAPH, elem.getElementType()); + assertEquals("NOTE", ((XWPFParagraph) elem).getText()); + + elem = it.next(); + assertEquals(BodyElementType.TABLE, elem.getElementType()); + t = (XWPFTable) elem; + assertEquals(1, t.getNumberOfRows()); + + elem = it.next(); + assertEquals(BodyElementType.PARAGRAPH, elem.getElementType()); + } + // ===== INTERNAL UTILITIES private XWPFDocument convert(String infile, String outfile) throws Exception { diff --git a/src/test/resources/simplewp/simplewpml-table-empty-cell.swpx b/src/test/resources/simplewp/simplewpml-table-empty-cell.swpx new file mode 100644 index 0000000..7f68ad4 --- /dev/null +++ b/src/test/resources/simplewp/simplewpml-table-empty-cell.swpx @@ -0,0 +1,23 @@ + + + + + + + + + + + + + A + + + + + + + + + + diff --git a/src/test/resources/simplewp/simplewpml-table-nested-03.swpx b/src/test/resources/simplewp/simplewpml-table-nested-03.swpx new file mode 100644 index 0000000..b110bba --- /dev/null +++ b/src/test/resources/simplewp/simplewpml-table-nested-03.swpx @@ -0,0 +1,27 @@ + + + + + + + + + NOTE + + + + + + + blabla + + + + + + + + + + + From 
f01dfe115704cd22302f1d79daff2fd7b463f07f Mon Sep 17 00:00:00 2001 From: "eliot.kimber" Date: Sun, 4 Aug 2024 15:44:01 -0500 Subject: [PATCH 3/4] WIP: Added multi-section test cases from Lars Marius Signed-off-by: eliot.kimber --- .../xml2docx/TestDocxGenerator.java | 44 +++++++++++++++++-- .../simplewp/simplewpml-multisection-01.swpx | 37 ++++++++++++++++ 2 files changed, 78 insertions(+), 3 deletions(-) create mode 100644 src/test/resources/simplewp/simplewpml-multisection-01.swpx diff --git a/src/test/java/org/wordinator/xml2docx/TestDocxGenerator.java b/src/test/java/org/wordinator/xml2docx/TestDocxGenerator.java index 81b0735..189d052 100644 --- a/src/test/java/org/wordinator/xml2docx/TestDocxGenerator.java +++ b/src/test/java/org/wordinator/xml2docx/TestDocxGenerator.java @@ -39,11 +39,14 @@ import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTFldChar; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTOnOff; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTPageMar; +import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTPageNumber; +import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTPageSz; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTR; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTSectPr; import org.openxmlformats.schemas.wordprocessingml.x2006.main.CTText; import org.openxmlformats.schemas.wordprocessingml.x2006.main.STFldCharType; -// import org.openxmlformats.schemas.wordprocessingml.x2006.main.STOnOff; +import org.openxmlformats.schemas.wordprocessingml.x2006.main.STNumberFormat; +import org.openxmlformats.schemas.wordprocessingml.x2006.main.STPageOrientation; import org.wordinator.xml2docx.generator.DocxConstants; import org.wordinator.xml2docx.generator.DocxGenerator; @@ -123,8 +126,8 @@ public void testMakeDocxWithSections() throws Exception { CTSectPr docSectPr = doc.getDocument().getBody().getSectPr(); assertNotNull("Expected to find a docSectPr 
element", docSectPr); - assertEquals("Expected 3 headers", 3, docSectPr.getHeaderReferenceList().size()); - assertEquals("Expected 3 footers", 3, docSectPr.getFooterReferenceList().size()); + assertEquals("Expected 6 headers", 6, docSectPr.getHeaderReferenceList().size()); + assertEquals("Expected 6 footers", 6, docSectPr.getFooterReferenceList().size()); // Document-level headers and footers: XWPFHeaderFooterPolicy hfPolicy = doc.getHeaderFooterPolicy(); @@ -824,6 +827,41 @@ public void testNestedTableParaBeforeTable() throws Exception { assertEquals(BodyElementType.PARAGRAPH, elem.getElementType()); } + @Test + public void testMultiSectionPageProps() throws Exception { + // verifies the solution to issues #68 and #117 + XWPFDocument doc = convert("simplewp/simplewpml-multisection-01.swpx", "out/multisection-01.docx"); + + List contents = doc.getBodyElements(); + assertEquals(2, contents.size()); + + Iterator it = contents.iterator(); + + IBodyElement elem = it.next(); + assertEquals(BodyElementType.PARAGRAPH, elem.getElementType()); + XWPFParagraph p = (XWPFParagraph) elem; + assertEquals("This is the first page numbered in Roman lower-case", p.getText()); + assertTrue("first para lacks section properties", p.getCTPPr().isSetSectPr()); + + elem = it.next(); + assertEquals(BodyElementType.PARAGRAPH, elem.getElementType()); + p = (XWPFParagraph) elem; + assertEquals("This is the first page numbered in decimal", p.getText()); + assertFalse("second para has section properties", p.getCTPPr().isSetSectPr()); + + CTSectPr sectPr = doc.getDocument().getBody().getSectPr(); + CTPageNumber pgNum = sectPr.getPgNumType(); + assertEquals(BigInteger.valueOf(1), pgNum.getStart()); + assertEquals(STNumberFormat.Enum.forString("decimal"), pgNum.getFmt()); + + // FIXME: check header & footer (a bit tricky) + + CTPageSz pageSz = sectPr.getPgSz(); + assertEquals(STPageOrientation.Enum.forString("portrait"), pageSz.getOrient()); + assertEquals(BigInteger.valueOf(11906), 
pageSz.getW()); + assertEquals(BigInteger.valueOf(16838), pageSz.getH()); + } + // ===== INTERNAL UTILITIES private XWPFDocument convert(String infile, String outfile) throws Exception { diff --git a/src/test/resources/simplewp/simplewpml-multisection-01.swpx b/src/test/resources/simplewp/simplewpml-multisection-01.swpx new file mode 100644 index 0000000..b3e985a --- /dev/null +++ b/src/test/resources/simplewp/simplewpml-multisection-01.swpx @@ -0,0 +1,37 @@ + + + + + + + + + + + + + + + + + This is the first page numbered in Roman lower-case + + + + + + + + + + + + + + + + This is the first page numbered in decimal + + + + From 4f8790c7f0b2bbf6f3b45f15d4a294bae49b64a0 Mon Sep 17 00:00:00 2001 From: "eliot.kimber" Date: Sun, 4 Aug 2024 15:59:49 -0500 Subject: [PATCH 4/4] Fixes #117: Last section handling from Lars Marius Signed-off-by: eliot.kimber --- .../xml2docx/generator/DocxConstants.java | 2 + .../xml2docx/generator/DocxGenerator.java | 74 ++++++++++++++++++- 2 files changed, 73 insertions(+), 3 deletions(-) diff --git a/src/main/java/org/wordinator/xml2docx/generator/DocxConstants.java b/src/main/java/org/wordinator/xml2docx/generator/DocxConstants.java index daa4f8d..3d6d069 100644 --- a/src/main/java/org/wordinator/xml2docx/generator/DocxConstants.java +++ b/src/main/java/org/wordinator/xml2docx/generator/DocxConstants.java @@ -123,6 +123,7 @@ public final class DocxConstants { public static final QName QNAME_XSLT_FORMAT_ATT = new QName("", "xslt-format"); // Elements: + public static final QName QNAME_BODY_ELEM = new QName(SIMPLE_WP_NS, "body"); public static final QName QNAME_COLS_ELEM = new QName(SIMPLE_WP_NS, "cols"); public static final QName QNAME_COL_ELEM = new QName(SIMPLE_WP_NS, "col"); public static final QName QNAME_CORE_PROPERTIES_ELEM = new QName(SIMPLE_WP_NS, "core-properties"); @@ -137,6 +138,7 @@ public final class DocxConstants { public static final QName QNAME_W_P_ELEM = new QName(OO_WPML_NS, "p"); public static final QName QNAME_R_ELEM = 
new QName(OO_WPML_NS, "r"); public static final QName QNAME_ROW_ELEM = new QName(SIMPLE_WP_NS, "row"); + public static final QName QNAME_SECTION_ELEM = new QName(SIMPLE_WP_NS, "section"); public static final QName QNAME_T_ELEM = new QName(OO_WPML_NS, "t"); // w:t -- text element public static final QName QNAME_TABLE_ELEM = new QName(SIMPLE_WP_NS, "table"); // w:table -- table element public static final QName QNAME_THEAD_ELEM = new QName(SIMPLE_WP_NS, "thead"); diff --git a/src/main/java/org/wordinator/xml2docx/generator/DocxGenerator.java b/src/main/java/org/wordinator/xml2docx/generator/DocxGenerator.java index 088d13d..9fa85c1 100644 --- a/src/main/java/org/wordinator/xml2docx/generator/DocxGenerator.java +++ b/src/main/java/org/wordinator/xml2docx/generator/DocxGenerator.java @@ -41,7 +41,9 @@ import org.apache.poi.ss.formula.eval.NotImplementedException; import org.apache.poi.util.Units; import org.apache.poi.wp.usermodel.HeaderFooterType; +import org.apache.poi.xwpf.usermodel.BodyElementType; import org.apache.poi.xwpf.usermodel.BreakType; +import org.apache.poi.xwpf.usermodel.IBodyElement; import org.apache.poi.xwpf.usermodel.ParagraphAlignment; import org.apache.poi.xwpf.usermodel.UnderlinePatterns; import org.apache.poi.xwpf.usermodel.XWPFAbstractFootnoteEndnote; @@ -451,7 +453,7 @@ private void constructDoc(XWPFDocument doc, XmlObject xml) throws DocxGeneration } cursor.pop(); cursor.push(); - cursor.toChild(new QName(DocxConstants.SIMPLE_WP_NS, "body")); + cursor.toChild(DocxConstants.QNAME_BODY_ELEM); setDocSettings(doc, xml); handleBody(doc, cursor.getObject()); @@ -464,13 +466,79 @@ private void constructDoc(XWPFDocument doc, XmlObject xml) throws DocxGeneration } else { CTDocument1 document = doc.getDocument(); CTBody body = (document.isSetBody() ? document.getBody() : document.addNewBody()); - @SuppressWarnings("unused") - CTSectPr sectPr = (body.isSetSectPr() ? 
body.getSectPr() : body.addNewSectPr()); + if (body.isSetSectPr()) { + body.getSectPr(); + } else { + body.addNewSectPr(); + } // At this point let Word fill in the details. } cursor.pop(); + // if the document has multiple sections we need to move the section + // properties from the last paragraph to directly within the body + XWPFParagraph lastPara = getLastParagraph(doc); + if (hasMultipleSections(xml) && lastPara != null && lastPara.getCTPPr().isSetSectPr()) { + CTSectPr sectPr = lastPara.getCTPPr().getSectPr(); + CTBody body = doc.getDocument().getBody(); + mergeSectPrs(body.getSectPr(), sectPr); + lastPara.getCTPPr().unsetSectPr(); + } + } + + private boolean hasMultipleSections(XmlObject xml) { + XmlCursor cursor = xml.newCursor(); + cursor.toFirstChild(); // go to root element + + if (!cursor.toChild(DocxConstants.QNAME_BODY_ELEM)) { + return false; + } + if (!cursor.toFirstChild()) { + return false; + } + int sections = cursor.getName().equals(DocxConstants.QNAME_SECTION_ELEM) ? 
1 : 0; + while (cursor.toNextSibling() && sections < 2) { + if (cursor.getName().equals(DocxConstants.QNAME_SECTION_ELEM)) { + sections++; + } + } + return sections >= 2; + } + + private XWPFParagraph getLastParagraph(XWPFDocument doc) { + XWPFParagraph lastPara = null; + for (IBodyElement elem : doc.getBodyElements()) { + if (elem.getElementType() == BodyElementType.PARAGRAPH) { + lastPara = (XWPFParagraph) elem; + } + } + return lastPara; + } + + // this method does not merge all section properties, but I hope it + // does merge those that wordinator actually sets + private void mergeSectPrs(CTSectPr toSectPr, CTSectPr fromSectPr) { + if (fromSectPr.isSetPgMar()) { + toSectPr.setPgMar(fromSectPr.getPgMar()); + } + if (fromSectPr.isSetPgSz()) { + toSectPr.setPgSz(fromSectPr.getPgSz()); + } + if (fromSectPr.isSetPgNumType()) { + toSectPr.setPgNumType(fromSectPr.getPgNumType()); + } + + for (CTHdrFtrRef ref : fromSectPr.getHeaderReferenceList()) { + int ix = toSectPr.getHeaderReferenceList().size(); + toSectPr.insertNewHeaderReference(ix); + toSectPr.setHeaderReferenceArray(ix, ref); + } + for (CTHdrFtrRef ref : fromSectPr.getFooterReferenceList()) { + int ix = toSectPr.getFooterReferenceList().size(); + toSectPr.insertNewFooterReference(ix); + toSectPr.setFooterReferenceArray(ix, ref); + } } /**