diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/pom.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/pom.xml index 4da567d0ad..7fefaa7c53 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/pom.xml +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/pom.xml @@ -49,6 +49,10 @@ asm ${asm.version} + + org.apache.commons + commons-lang3 + com.epam diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/UniversalExecutableParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/executable/UniversalExecutableParser.java similarity index 76% rename from tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/UniversalExecutableParser.java rename to tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/executable/UniversalExecutableParser.java index 4877429119..20e12a564c 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/UniversalExecutableParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/executable/UniversalExecutableParser.java @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.tika.parser.pkg; +package org.apache.tika.parser.executable; import java.io.IOException; import java.io.InputStream; @@ -49,6 +49,9 @@ public class UniversalExecutableParser implements Parser { private static final Set SUPPORTED_TYPES = Collections.singleton(MediaType.application("x-mach-o-universal")); + private static final int MAX_ARCHS_COUNT = 1000; + private static final int MAX_ARCH_SIZE = 500_000_000;//arbitrary + @Override public Set getSupportedTypes(ParseContext arg0) { return SUPPORTED_TYPES; @@ -69,11 +72,11 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, if ((first4[0] == (byte) 0xBF || first4[0] == (byte) 0xBE) && first4[1] == (byte) 0xBA && first4[2] == (byte) 0xFE && first4[3] == (byte) 0xCA) { - parseMachO(xhtml, extractor, handler, metadata, stream, first4); + parseMachO(xhtml, extractor, metadata, stream, first4); } else if (first4[0] == (byte) 0xCA && first4[1] == (byte) 0xFE && first4[2] == (byte) 0xBA && (first4[3] == (byte) 0xBF || first4[3] == (byte) 0xBE)) { - parseMachO(xhtml, extractor, handler, metadata, stream, first4); + parseMachO(xhtml, extractor, metadata, stream, first4); } else { throw new UnsupportedFormatException("Not a universal executable file"); } @@ -85,32 +88,34 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, * Parses a Mach-O Universal file */ public void parseMachO(XHTMLContentHandler xhtml, EmbeddedDocumentExtractor extractor, - ContentHandler handler, Metadata metadata, InputStream stream, + Metadata metadata, InputStream stream, byte[] first4) throws IOException, SAXException, TikaException { var currentOffset = (long) first4.length; var isLE = first4[3] == (byte) 0xCA; var is64 = first4[isLE ? 0 : 3] == (byte) 0xBF; - var archStructSize = 4 /* cputype */ + 4 /* cpusubtype */ + (is64 + int archStructSize = 4 /* cputype */ + 4 /* cpusubtype */ + (is64 ? 8 /* offset */ + 8 /* size */ + 4 /* align */ + 4 /* reserved */ : 4 /* offset */ + 4 /* size */ + 4 /* align */); - var archsCount = isLE ? EndianUtils.readIntLE(stream) : EndianUtils.readIntBE(stream); + int archsCount = isLE ? EndianUtils.readIntLE(stream) : EndianUtils.readIntBE(stream); if (archsCount < 1) { throw new TikaException("Invalid number of architectures: " + archsCount); } + if (archsCount > MAX_ARCHS_COUNT) { + throw new TikaException("Number of architectures=" + archsCount + " greater than max allowed=" + MAX_ARCHS_COUNT); + } + currentOffset += 4; - var archsSize = archsCount * archStructSize; + long archsSize = (long) archsCount * archStructSize; var unsortedOffsets = false; var offsetAndSizePerArch = new Pair[archsCount]; - for (var archIndex = 0; archIndex < archsCount; archIndex++) { - if (stream.skip(8) != 8) { - throw new TikaException("Failed to skip cputype and cpusubtype"); - } + for (int archIndex = 0; archIndex < archsCount; archIndex++) { + IOUtils.skipFully(stream, 8); - var offset = is64 + long offset = is64 ? (isLE ? EndianUtils.readLongLE(stream) : EndianUtils.readLongBE(stream)) : (isLE ? EndianUtils.readIntLE(stream) : EndianUtils.readIntBE(stream)); if (offset < 4 + 4 + archsSize) { @@ -119,20 +124,19 @@ public void parseMachO(XHTMLContentHandler xhtml, EmbeddedDocumentExtractor extr if (!unsortedOffsets && archIndex > 0 && offset < (long) offsetAndSizePerArch[archIndex - 1].getLeft()) { unsortedOffsets = true; } - var size = is64 + long size = is64 ? (isLE ? EndianUtils.readLongLE(stream) : EndianUtils.readLongBE(stream)) : (isLE ? EndianUtils.readIntLE(stream) : EndianUtils.readIntBE(stream)); + if (size < 0 || size > MAX_ARCH_SIZE) { + throw new TikaException("Arch size=" + size + " must be > 0 and < " + MAX_ARCH_SIZE); + } offsetAndSizePerArch[archIndex] = Pair.of(offset, size); if (is64) { - if (stream.skip(8) != 8) { - throw new TikaException("Failed to skip align and reserved"); - } + IOUtils.skipFully(stream, 8); } else { - if (stream.skip(4) != 4) { - throw new TikaException("Failed to skip align"); - } + IOUtils.skipFully(stream, 4); } currentOffset += archStructSize; @@ -141,24 +145,22 @@ public void parseMachO(XHTMLContentHandler xhtml, EmbeddedDocumentExtractor extr Arrays.sort(offsetAndSizePerArch, Comparator.comparingLong(entry -> (long) entry.getLeft())); } - for (var archIndex = 0; archIndex < archsCount; archIndex++) { - var skipUntilStart = (long) offsetAndSizePerArch[archIndex].getLeft() - currentOffset; - if (stream.skip(skipUntilStart) != skipUntilStart) { - throw new TikaException("Failed to skip to the start of the per-architecture Mach-O"); - } + for (int archIndex = 0; archIndex < archsCount; archIndex++) { + long skipUntilStart = (long)offsetAndSizePerArch[archIndex].getLeft() - currentOffset; + IOUtils.skipFully(stream, skipUntilStart); currentOffset += skipUntilStart; - - var perArchMachO = new byte[(int) (long) offsetAndSizePerArch[archIndex].getRight()]; - if (stream.read(perArchMachO) != perArchMachO.length) { - throw new TikaException("Failed to read the per-architecture Mach-O"); - } + long sz = (long)offsetAndSizePerArch[archIndex].getRight(); + //we bounds checked this above. + byte[] perArchMachO = new byte[(int)sz]; + IOUtils.readFully(stream, perArchMachO); currentOffset += perArchMachO.length; var perArchMetadata = new Metadata(); var tikaInputStream = TikaInputStream.get(perArchMachO, perArchMetadata); if (extractor.shouldParseEmbedded(perArchMetadata)) { - extractor.parseEmbedded(tikaInputStream, handler, perArchMetadata, true); + extractor.parseEmbedded(tikaInputStream, xhtml, perArchMetadata, true); } } } + } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser index d15be7a232..9b73c5db10 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser @@ -16,6 +16,7 @@ org.apache.tika.parser.asm.ClassParser org.apache.tika.parser.code.SourceCodeParser org.apache.tika.parser.executable.ExecutableParser +org.apache.tika.parser.executable.UniversalExecutableParser org.apache.tika.parser.mat.MatParser org.apache.tika.parser.sas.SAS7BDATParser diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/UniversalExecutableParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/test/java/org/apache/tika/parser/executable/UniversalExecutableParserTest.java similarity index 54% rename from tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/UniversalExecutableParserTest.java rename to tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/test/java/org/apache/tika/parser/executable/UniversalExecutableParserTest.java index 706671338c..a123f6c453 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/UniversalExecutableParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/test/java/org/apache/tika/parser/executable/UniversalExecutableParserTest.java @@ -14,41 +14,29 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.tika.parser.pkg; +package org.apache.tika.parser.executable; import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertNull; -import java.io.InputStream; +import java.util.List; import org.junit.jupiter.api.Test; -import org.xml.sax.ContentHandler; +import org.apache.tika.TikaTest; import org.apache.tika.metadata.Metadata; -import org.apache.tika.sax.BodyContentHandler; /** * Test case for parsing universal executable files. */ -public class UniversalExecutableParserTest extends AbstractPkgTest { +public class UniversalExecutableParserTest extends TikaTest { @Test public void testMachO() throws Exception { - ContentHandler handler = new BodyContentHandler(); - Metadata metadata = new Metadata(); - - try (InputStream stream = getResourceAsStream("/test-documents/testMacOS-x86_64-arm64")) { - AUTO_DETECT_PARSER.parse(stream, handler, metadata, monitoringContext); - } - - assertEquals(2, monitor.filenames.size()); - assertEquals(2, monitor.mediaTypes.size()); - - for (String filename : monitor.filenames) { - assertNull(filename); - } - for (String mediaType : monitor.mediaTypes) { - assertEquals("application/x-mach-o-executable", mediaType); + List metadataList = getRecursiveMetadata("testMacOS-x86_64-arm64"); + assertEquals(3, metadataList.size()); + assertEquals("application/x-mach-o-universal", metadataList.get(0).get(Metadata.CONTENT_TYPE)); + for (int i = 1; i < 3; i++) { + assertEquals("application/x-mach-o-executable", metadataList.get(i).get(Metadata.CONTENT_TYPE)); } } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/resources/test-documents/testMacOS-x86_64-arm64 b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/test/resources/test-documents/testMacOS-x86_64-arm64 similarity index 100% rename from tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/resources/test-documents/testMacOS-x86_64-arm64 rename to tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/test/resources/test-documents/testMacOS-x86_64-arm64 diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser index 3c604dcc5a..89ce8c87b8 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser @@ -16,4 +16,3 @@ org.apache.tika.parser.pkg.CompressorParser org.apache.tika.parser.pkg.PackageParser org.apache.tika.parser.pkg.RarParser -org.apache.tika.parser.pkg.UniversalExecutableParser