diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/pom.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/pom.xml
index 4da567d0ad..7fefaa7c53 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/pom.xml
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/pom.xml
@@ -49,6 +49,10 @@
asm
${asm.version}
+
+ org.apache.commons
+ commons-lang3
+
com.epam
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/UniversalExecutableParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/executable/UniversalExecutableParser.java
similarity index 76%
rename from tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/UniversalExecutableParser.java
rename to tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/executable/UniversalExecutableParser.java
index 4877429119..20e12a564c 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/UniversalExecutableParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/executable/UniversalExecutableParser.java
@@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package org.apache.tika.parser.pkg;
+package org.apache.tika.parser.executable;
import java.io.IOException;
import java.io.InputStream;
@@ -49,6 +49,9 @@ public class UniversalExecutableParser implements Parser {
private static final Set SUPPORTED_TYPES =
Collections.singleton(MediaType.application("x-mach-o-universal"));
+ private static final int MAX_ARCHS_COUNT = 1000;
+ private static final int MAX_ARCH_SIZE = 500_000_000;//arbitrary cap, in bytes, on the size of a single per-architecture Mach-O
+
@Override
public Set getSupportedTypes(ParseContext arg0) {
return SUPPORTED_TYPES;
@@ -69,11 +72,11 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
if ((first4[0] == (byte) 0xBF || first4[0] == (byte) 0xBE) &&
first4[1] == (byte) 0xBA && first4[2] == (byte) 0xFE && first4[3] == (byte) 0xCA) {
- parseMachO(xhtml, extractor, handler, metadata, stream, first4);
+ parseMachO(xhtml, extractor, metadata, stream, first4);
} else if (first4[0] == (byte) 0xCA && first4[1] == (byte) 0xFE &&
first4[2] == (byte) 0xBA &&
(first4[3] == (byte) 0xBF || first4[3] == (byte) 0xBE)) {
- parseMachO(xhtml, extractor, handler, metadata, stream, first4);
+ parseMachO(xhtml, extractor, metadata, stream, first4);
} else {
throw new UnsupportedFormatException("Not a universal executable file");
}
@@ -85,32 +88,34 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
* Parses a Mach-O Universal file
*/
public void parseMachO(XHTMLContentHandler xhtml, EmbeddedDocumentExtractor extractor,
- ContentHandler handler, Metadata metadata, InputStream stream,
+ Metadata metadata, InputStream stream,
byte[] first4)
throws IOException, SAXException, TikaException {
var currentOffset = (long) first4.length;
var isLE = first4[3] == (byte) 0xCA;
var is64 = first4[isLE ? 0 : 3] == (byte) 0xBF;
- var archStructSize = 4 /* cputype */ + 4 /* cpusubtype */ + (is64
+ int archStructSize = 4 /* cputype */ + 4 /* cpusubtype */ + (is64
? 8 /* offset */ + 8 /* size */ + 4 /* align */ + 4 /* reserved */
: 4 /* offset */ + 4 /* size */ + 4 /* align */);
- var archsCount = isLE ? EndianUtils.readIntLE(stream) : EndianUtils.readIntBE(stream);
+ int archsCount = isLE ? EndianUtils.readIntLE(stream) : EndianUtils.readIntBE(stream);
if (archsCount < 1) {
throw new TikaException("Invalid number of architectures: " + archsCount);
}
+ if (archsCount > MAX_ARCHS_COUNT) {
+ throw new TikaException("Number of architectures=" + archsCount + " greater than max allowed=" + MAX_ARCHS_COUNT);
+ }
+
currentOffset += 4;
- var archsSize = archsCount * archStructSize;
+ long archsSize = (long) archsCount * archStructSize;
var unsortedOffsets = false;
var offsetAndSizePerArch = new Pair[archsCount];
- for (var archIndex = 0; archIndex < archsCount; archIndex++) {
- if (stream.skip(8) != 8) {
- throw new TikaException("Failed to skip cputype and cpusubtype");
- }
+ for (int archIndex = 0; archIndex < archsCount; archIndex++) {
+ IOUtils.skipFully(stream, 8);
- var offset = is64
+ long offset = is64
? (isLE ? EndianUtils.readLongLE(stream) : EndianUtils.readLongBE(stream))
: (isLE ? EndianUtils.readIntLE(stream) : EndianUtils.readIntBE(stream));
if (offset < 4 + 4 + archsSize) {
@@ -119,20 +124,19 @@ public void parseMachO(XHTMLContentHandler xhtml, EmbeddedDocumentExtractor extr
if (!unsortedOffsets && archIndex > 0 && offset < (long) offsetAndSizePerArch[archIndex - 1].getLeft()) {
unsortedOffsets = true;
}
- var size = is64
+ long size = is64
? (isLE ? EndianUtils.readLongLE(stream) : EndianUtils.readLongBE(stream))
: (isLE ? EndianUtils.readIntLE(stream) : EndianUtils.readIntBE(stream));
+ if (size < 0 || size > MAX_ARCH_SIZE) {
+ throw new TikaException("Arch size=" + size + " must be >= 0 and <= " + MAX_ARCH_SIZE);
+ }
offsetAndSizePerArch[archIndex] = Pair.of(offset, size);
if (is64) {
- if (stream.skip(8) != 8) {
- throw new TikaException("Failed to skip align and reserved");
- }
+ IOUtils.skipFully(stream, 8);
} else {
- if (stream.skip(4) != 4) {
- throw new TikaException("Failed to skip align");
- }
+ IOUtils.skipFully(stream, 4);
}
currentOffset += archStructSize;
@@ -141,24 +145,22 @@ public void parseMachO(XHTMLContentHandler xhtml, EmbeddedDocumentExtractor extr
Arrays.sort(offsetAndSizePerArch, Comparator.comparingLong(entry -> (long) entry.getLeft()));
}
- for (var archIndex = 0; archIndex < archsCount; archIndex++) {
- var skipUntilStart = (long) offsetAndSizePerArch[archIndex].getLeft() - currentOffset;
- if (stream.skip(skipUntilStart) != skipUntilStart) {
- throw new TikaException("Failed to skip to the start of the per-architecture Mach-O");
- }
+ for (int archIndex = 0; archIndex < archsCount; archIndex++) {
+ long skipUntilStart = (long)offsetAndSizePerArch[archIndex].getLeft() - currentOffset;
+ IOUtils.skipFully(stream, skipUntilStart);
currentOffset += skipUntilStart;
-
- var perArchMachO = new byte[(int) (long) offsetAndSizePerArch[archIndex].getRight()];
- if (stream.read(perArchMachO) != perArchMachO.length) {
- throw new TikaException("Failed to read the per-architecture Mach-O");
- }
+ long sz = (long)offsetAndSizePerArch[archIndex].getRight();
+ //narrowing to int is safe: size was bounds checked against MAX_ARCH_SIZE when the arch headers were read.
+ byte[] perArchMachO = new byte[(int)sz];
+ IOUtils.readFully(stream, perArchMachO);
currentOffset += perArchMachO.length;
var perArchMetadata = new Metadata();
var tikaInputStream = TikaInputStream.get(perArchMachO, perArchMetadata);
if (extractor.shouldParseEmbedded(perArchMetadata)) {
- extractor.parseEmbedded(tikaInputStream, handler, perArchMetadata, true);
+ extractor.parseEmbedded(tikaInputStream, xhtml, perArchMetadata, true);
}
}
}
+
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
index d15be7a232..9b73c5db10 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
@@ -16,6 +16,7 @@
org.apache.tika.parser.asm.ClassParser
org.apache.tika.parser.code.SourceCodeParser
org.apache.tika.parser.executable.ExecutableParser
+org.apache.tika.parser.executable.UniversalExecutableParser
org.apache.tika.parser.mat.MatParser
org.apache.tika.parser.sas.SAS7BDATParser
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/UniversalExecutableParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/test/java/org/apache/tika/parser/executable/UniversalExecutableParserTest.java
similarity index 54%
rename from tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/UniversalExecutableParserTest.java
rename to tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/test/java/org/apache/tika/parser/executable/UniversalExecutableParserTest.java
index 706671338c..a123f6c453 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/UniversalExecutableParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/test/java/org/apache/tika/parser/executable/UniversalExecutableParserTest.java
@@ -14,41 +14,29 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-package org.apache.tika.parser.pkg;
+package org.apache.tika.parser.executable;
import static org.junit.jupiter.api.Assertions.assertEquals;
-import static org.junit.jupiter.api.Assertions.assertNull;
-import java.io.InputStream;
+import java.util.List;
import org.junit.jupiter.api.Test;
-import org.xml.sax.ContentHandler;
+import org.apache.tika.TikaTest;
import org.apache.tika.metadata.Metadata;
-import org.apache.tika.sax.BodyContentHandler;
/**
* Test case for parsing universal executable files.
*/
-public class UniversalExecutableParserTest extends AbstractPkgTest {
+public class UniversalExecutableParserTest extends TikaTest {
@Test
public void testMachO() throws Exception {
- ContentHandler handler = new BodyContentHandler();
- Metadata metadata = new Metadata();
-
- try (InputStream stream = getResourceAsStream("/test-documents/testMacOS-x86_64-arm64")) {
- AUTO_DETECT_PARSER.parse(stream, handler, metadata, monitoringContext);
- }
-
- assertEquals(2, monitor.filenames.size());
- assertEquals(2, monitor.mediaTypes.size());
-
- for (String filename : monitor.filenames) {
- assertNull(filename);
- }
- for (String mediaType : monitor.mediaTypes) {
- assertEquals("application/x-mach-o-executable", mediaType);
+ List metadataList = getRecursiveMetadata("testMacOS-x86_64-arm64");
+ assertEquals(3, metadataList.size());
+ assertEquals("application/x-mach-o-universal", metadataList.get(0).get(Metadata.CONTENT_TYPE));
+ for (int i = 1; i < 3; i++) {
+ assertEquals("application/x-mach-o-executable", metadataList.get(i).get(Metadata.CONTENT_TYPE));
}
}
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/resources/test-documents/testMacOS-x86_64-arm64 b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/test/resources/test-documents/testMacOS-x86_64-arm64
similarity index 100%
rename from tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/resources/test-documents/testMacOS-x86_64-arm64
rename to tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/test/resources/test-documents/testMacOS-x86_64-arm64
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
index 3c604dcc5a..89ce8c87b8 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
@@ -16,4 +16,3 @@
org.apache.tika.parser.pkg.CompressorParser
org.apache.tika.parser.pkg.PackageParser
org.apache.tika.parser.pkg.RarParser
-org.apache.tika.parser.pkg.UniversalExecutableParser