Skip to content

Commit

Permalink
TIKA-4309 -- some mods (#1993)
Browse files — browse the repository at this point in the history
* TIKA-4309 -- some mods, add dependency on lang3 -- DRY on Pair; add bounds checking
  • Loading branch information
tballison authored Oct 16, 2024
1 parent 2fd84ee commit f8f9857
Show file tree
Hide file tree
Showing 6 changed files with 46 additions and 52 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,10 @@
<artifactId>asm</artifactId>
<version>${asm.version}</version>
</dependency>
<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
</dependency>

<dependency>
<groupId>com.epam</groupId>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.pkg;
package org.apache.tika.parser.executable;

import java.io.IOException;
import java.io.InputStream;
Expand Down Expand Up @@ -49,6 +49,9 @@ public class UniversalExecutableParser implements Parser {
private static final Set<MediaType> SUPPORTED_TYPES =
Collections.singleton(MediaType.application("x-mach-o-universal"));

private static final int MAX_ARCHS_COUNT = 1000;
private static final int MAX_ARCH_SIZE = 500_000_000;//arbitrary

@Override
public Set<MediaType> getSupportedTypes(ParseContext arg0) {
return SUPPORTED_TYPES;
Expand All @@ -69,11 +72,11 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata,

if ((first4[0] == (byte) 0xBF || first4[0] == (byte) 0xBE) &&
first4[1] == (byte) 0xBA && first4[2] == (byte) 0xFE && first4[3] == (byte) 0xCA) {
parseMachO(xhtml, extractor, handler, metadata, stream, first4);
parseMachO(xhtml, extractor, metadata, stream, first4);
} else if (first4[0] == (byte) 0xCA && first4[1] == (byte) 0xFE &&
first4[2] == (byte) 0xBA &&
(first4[3] == (byte) 0xBF || first4[3] == (byte) 0xBE)) {
parseMachO(xhtml, extractor, handler, metadata, stream, first4);
parseMachO(xhtml, extractor, metadata, stream, first4);
} else {
throw new UnsupportedFormatException("Not a universal executable file");
}
Expand All @@ -85,32 +88,34 @@ public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
* Parses a Mach-O Universal file
*/
public void parseMachO(XHTMLContentHandler xhtml, EmbeddedDocumentExtractor extractor,
ContentHandler handler, Metadata metadata, InputStream stream,
Metadata metadata, InputStream stream,
byte[] first4)
throws IOException, SAXException, TikaException {
var currentOffset = (long) first4.length;
var isLE = first4[3] == (byte) 0xCA;
var is64 = first4[isLE ? 0 : 3] == (byte) 0xBF;
var archStructSize = 4 /* cputype */ + 4 /* cpusubtype */ + (is64
int archStructSize = 4 /* cputype */ + 4 /* cpusubtype */ + (is64
? 8 /* offset */ + 8 /* size */ + 4 /* align */ + 4 /* reserved */
: 4 /* offset */ + 4 /* size */ + 4 /* align */);

var archsCount = isLE ? EndianUtils.readIntLE(stream) : EndianUtils.readIntBE(stream);
int archsCount = isLE ? EndianUtils.readIntLE(stream) : EndianUtils.readIntBE(stream);
if (archsCount < 1) {
throw new TikaException("Invalid number of architectures: " + archsCount);
}
if (archsCount > MAX_ARCHS_COUNT) {
throw new TikaException("Number of architectures=" + archsCount + " greater than max allowed=" + MAX_ARCHS_COUNT);
}

currentOffset += 4;

var archsSize = archsCount * archStructSize;
long archsSize = (long) archsCount * archStructSize;

var unsortedOffsets = false;
var offsetAndSizePerArch = new Pair[archsCount];
for (var archIndex = 0; archIndex < archsCount; archIndex++) {
if (stream.skip(8) != 8) {
throw new TikaException("Failed to skip cputype and cpusubtype");
}
for (int archIndex = 0; archIndex < archsCount; archIndex++) {
IOUtils.skipFully(stream, 8);

var offset = is64
long offset = is64
? (isLE ? EndianUtils.readLongLE(stream) : EndianUtils.readLongBE(stream))
: (isLE ? EndianUtils.readIntLE(stream) : EndianUtils.readIntBE(stream));
if (offset < 4 + 4 + archsSize) {
Expand All @@ -119,20 +124,19 @@ public void parseMachO(XHTMLContentHandler xhtml, EmbeddedDocumentExtractor extr
if (!unsortedOffsets && archIndex > 0 && offset < (long) offsetAndSizePerArch[archIndex - 1].getLeft()) {
unsortedOffsets = true;
}
var size = is64
long size = is64
? (isLE ? EndianUtils.readLongLE(stream) : EndianUtils.readLongBE(stream))
: (isLE ? EndianUtils.readIntLE(stream) : EndianUtils.readIntBE(stream));

if (size < 0 || size > MAX_ARCH_SIZE) {
throw new TikaException("Arch size=" + size + " must be > 0 and < " + MAX_ARCH_SIZE);
}
offsetAndSizePerArch[archIndex] = Pair.of(offset, size);

if (is64) {
if (stream.skip(8) != 8) {
throw new TikaException("Failed to skip align and reserved");
}
IOUtils.skipFully(stream, 8);
} else {
if (stream.skip(4) != 4) {
throw new TikaException("Failed to skip align");
}
IOUtils.skipFully(stream, 4);
}

currentOffset += archStructSize;
Expand All @@ -141,24 +145,22 @@ public void parseMachO(XHTMLContentHandler xhtml, EmbeddedDocumentExtractor extr
Arrays.sort(offsetAndSizePerArch, Comparator.comparingLong(entry -> (long) entry.getLeft()));
}

for (var archIndex = 0; archIndex < archsCount; archIndex++) {
var skipUntilStart = (long) offsetAndSizePerArch[archIndex].getLeft() - currentOffset;
if (stream.skip(skipUntilStart) != skipUntilStart) {
throw new TikaException("Failed to skip to the start of the per-architecture Mach-O");
}
for (int archIndex = 0; archIndex < archsCount; archIndex++) {
long skipUntilStart = (long)offsetAndSizePerArch[archIndex].getLeft() - currentOffset;
IOUtils.skipFully(stream, skipUntilStart);
currentOffset += skipUntilStart;

var perArchMachO = new byte[(int) (long) offsetAndSizePerArch[archIndex].getRight()];
if (stream.read(perArchMachO) != perArchMachO.length) {
throw new TikaException("Failed to read the per-architecture Mach-O");
}
long sz = (long)offsetAndSizePerArch[archIndex].getRight();
//we bounds checked this above.
byte[] perArchMachO = new byte[(int)sz];
IOUtils.readFully(stream, perArchMachO);
currentOffset += perArchMachO.length;

var perArchMetadata = new Metadata();
var tikaInputStream = TikaInputStream.get(perArchMachO, perArchMetadata);
if (extractor.shouldParseEmbedded(perArchMetadata)) {
extractor.parseEmbedded(tikaInputStream, handler, perArchMetadata, true);
extractor.parseEmbedded(tikaInputStream, xhtml, perArchMetadata, true);
}
}
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
org.apache.tika.parser.asm.ClassParser
org.apache.tika.parser.code.SourceCodeParser
org.apache.tika.parser.executable.ExecutableParser
org.apache.tika.parser.executable.UniversalExecutableParser
org.apache.tika.parser.mat.MatParser
org.apache.tika.parser.sas.SAS7BDATParser

Original file line number Diff line number Diff line change
Expand Up @@ -14,41 +14,29 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.pkg;
package org.apache.tika.parser.executable;

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNull;

import java.io.InputStream;
import java.util.List;

import org.junit.jupiter.api.Test;
import org.xml.sax.ContentHandler;

import org.apache.tika.TikaTest;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.sax.BodyContentHandler;

/**
* Test case for parsing universal executable files.
*/
public class UniversalExecutableParserTest extends AbstractPkgTest {
public class UniversalExecutableParserTest extends TikaTest {

@Test
public void testMachO() throws Exception {
ContentHandler handler = new BodyContentHandler();
Metadata metadata = new Metadata();

try (InputStream stream = getResourceAsStream("/test-documents/testMacOS-x86_64-arm64")) {
AUTO_DETECT_PARSER.parse(stream, handler, metadata, monitoringContext);
}

assertEquals(2, monitor.filenames.size());
assertEquals(2, monitor.mediaTypes.size());

for (String filename : monitor.filenames) {
assertNull(filename);
}
for (String mediaType : monitor.mediaTypes) {
assertEquals("application/x-mach-o-executable", mediaType);
List<Metadata> metadataList = getRecursiveMetadata("testMacOS-x86_64-arm64");
assertEquals(3, metadataList.size());
assertEquals("application/x-mach-o-universal", metadataList.get(0).get(Metadata.CONTENT_TYPE));
for (int i = 1; i < 3; i++) {
assertEquals("application/x-mach-o-executable", metadataList.get(i).get(Metadata.CONTENT_TYPE));
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -16,4 +16,3 @@
org.apache.tika.parser.pkg.CompressorParser
org.apache.tika.parser.pkg.PackageParser
org.apache.tika.parser.pkg.RarParser
org.apache.tika.parser.pkg.UniversalExecutableParser

0 comments on commit f8f9857

Please sign in to comment.