Skip to content

Commit

Permalink
FEAT: Text stream validation for literals
Browse files Browse the repository at this point in the history
- added `PdfTextStream` class to handle:
  - detection of a text stream between `BT` and `ET` operators;
  - distinguishing between hex streams (not validated yet) and string literals;
  - balancing parentheses in string literals while accounting for escaped characters;
- added two new PDF error messages:
  - `PDF-HUL-163` IO Exception reading text stream;
  - `PDF-HUL-164` Unbalanced parentheses in text stream;
- added a first cut function for walking pages for validation: `PdfModule:checkPageTextStreams`;
- added method to check text streams: `PageObject:checkTextStreams`;
- tidied up page content stream handling, empty lists are safer than nulls;
- check page text streams after font finding;
- removed unnecessary param from filter extraction;
- fixed minor issue in header handling that terminated processing early for invalid files; and
- added test files for the above.
  • Loading branch information
carlwilson committed Jan 24, 2025
1 parent 4fd44c2 commit a28bad1
Show file tree
Hide file tree
Showing 12 changed files with 220 additions and 63 deletions.
11 changes: 11 additions & 0 deletions jhove-bbt/scripts/create-1.33-target.sh
Original file line number Diff line number Diff line change
Expand Up @@ -82,3 +82,14 @@ do
cp "${candidateRoot}/${filename}" "${targetRoot}/${filename}"
fi
done

# Text stream validation changed the expected output for the PDF-hul test
# files below: take the candidate result for each one as the new target.
# A file missing from the candidate tree is skipped.
declare -a pdf_version_affected=(
  "errors/modules/PDF-hul/T02-05-01_009_Missing_open_paranthesis.pdf.jhove.xml"
  "errors/modules/PDF-hul/T02-05-01_010_Missing_closing_paranthesis.pdf.jhove.xml"
  "errors/modules/PDF-hul/T02-05-01_011_paranthesis-substituted-with-brackets.pdf.jhove.xml"
)
for filename in "${pdf_version_affected[@]}"; do
  if [[ -f "${candidateRoot}/${filename}" ]]
  then
    cp "${candidateRoot}/${filename}" "${targetRoot}/${filename}"
  fi
done
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,7 @@
import edu.harvard.hul.ois.jhove.module.pdf.PdfSimpleObject;
import edu.harvard.hul.ois.jhove.module.pdf.PdfStream;
import edu.harvard.hul.ois.jhove.module.pdf.PdfStrings;
import edu.harvard.hul.ois.jhove.module.pdf.PdfTextStream;
import edu.harvard.hul.ois.jhove.module.pdf.PdfXMPSource;
import edu.harvard.hul.ois.jhove.module.pdf.StringValuedToken;
import edu.harvard.hul.ois.jhove.module.pdf.TaggedProfile;
Expand Down Expand Up @@ -867,9 +868,11 @@ public final void parse(RandomAccessFile raf, RepInfo info)
}
findImages(info);
findFonts(info);
checkPageTextStreams(info);

/* Object is well-formed PDF. */


// Calculate checksums if not already present
checksumIfRafNotCopied(info, raf);

Expand Down Expand Up @@ -1050,12 +1053,13 @@ protected boolean parseHeader(RepInfo info) {
try {
header = PdfHeader.parseHeader(_parser);
} catch (PdfException e) {
info.setMessage(new ErrorMessage(e.getJhoveMessage(), 0L)); // PDF-HUL-155
if (e instanceof PdfInvalidException) {
info.setValid(false);
return true;
} else {
info.setWellFormed(false);
}
info.setMessage(new ErrorMessage(e.getJhoveMessage(), 0L)); // PDF-HUL-155
return false;
}
_version = header.getVersionString();
Expand Down Expand Up @@ -2195,17 +2199,14 @@ protected void findExternalStreams(RepInfo info) throws IOException {
break;
}
// Get the streams for the page and walk through them
List<PdfStream> streams = page.getContentStreams();
if (streams != null) {
ListIterator<PdfStream> streamIter = streams.listIterator();
while (streamIter.hasNext()) {
PdfStream stream = streamIter.next();
String specStr = stream.getFileSpecification();
if (specStr != null) {
Property prop = new Property(PROP_NAME_FILE,
PropertyType.STRING, specStr);
_extStreamsList.add(prop);
}
ListIterator<PdfStream> streamIter = page.getContentStreams().listIterator();
while (streamIter.hasNext()) {
PdfStream stream = streamIter.next();
String specStr = stream.getFileSpecification();
if (specStr != null) {
Property prop = new Property(PROP_NAME_FILE,
PropertyType.STRING, specStr);
_extStreamsList.add(prop);
}
}
}
Expand Down Expand Up @@ -2244,14 +2245,11 @@ protected boolean findFilters(RepInfo info) throws IOException {
break;
}
// Get the streams for the page and walk through them
List<PdfStream> streams = page.getContentStreams();
if (streams != null) {
ListIterator<PdfStream> streamIter = streams.listIterator();
while (streamIter.hasNext()) {
PdfStream stream = streamIter.next();
Filter[] filters = stream.getFilters();
extractFilters(filters, stream);
}
ListIterator<PdfStream> streamIter = page.getContentStreams().listIterator();
while (streamIter.hasNext()) {
PdfStream stream = streamIter.next();
Filter[] filters = stream.getFilters();
extractFilters(filters);
}
}
} catch (PdfException e) {
Expand All @@ -2271,7 +2269,7 @@ protected boolean findFilters(RepInfo info) throws IOException {
* Returns the filter string whether it's added or not,
* or null if there are no filters.
*/
protected String extractFilters(Filter[] filters, PdfStream stream) {
protected String extractFilters(Filter[] filters) {
/*
* Concatenate the names into a string of names separated
* by spaces.
Expand Down Expand Up @@ -2390,8 +2388,7 @@ protected void findImages(RepInfo info) throws IOException {
String mimeType = imageMimeFromFilters(
filters);
niso.setMimeType(mimeType);
String filt = extractFilters(filters,
(PdfStream) xob);
String filt = extractFilters(filters);
if (filt != null) {
// If the filter is one which the NISO
// schema
Expand Down Expand Up @@ -3510,11 +3507,45 @@ protected void addDestination(PdfObject itemObj, String propName,
}
}

/**
 * Walks the pages of the document tree in sequence and validates every
 * content stream of every page as a text stream.
 *
 * Nothing is checked when the file is encrypted. The walk stops at the
 * first failure, which is recorded on <code>info</code>.
 *
 * @param info
 *            the representation information that receives any error message
 */
protected void checkPageTextStreams(final RepInfo info) {
    if (_encrypted) {
        // Text streams of an encrypted file are not checked
        return;
    }
    _docTreeRoot.startWalk();
    try {
        PageObject currentPage;
        // nextPageObject() returns null once every page has been visited
        while ((currentPage = _docTreeRoot.nextPageObject()) != null) {
            for (PdfStream contentStream : currentPage.getContentStreams()) {
                new PdfTextStream(contentStream, _raf).validate();
            }
        }
    } catch (PdfException pdfProblem) {
        pdfProblem.disparage(info);
        info.setMessage(new ErrorMessage(pdfProblem.getJhoveMessage()));
    } catch (IOException ioProblem) {
        info.setWellFormed(false);
        info.setMessage(new ErrorMessage(JhoveMessages.getMessageInstance(
                MessageConstants.PDF_HUL_163, ioProblem.getMessage()))); // PDF-HUL-163
    } catch (NegativeArraySizeException ignored) {
        // Deliberately not reported for now: it seems to be a bug in the
        // xref stream handler rather than a problem with the text stream
    }
}

/*
* Build up a property for one of the kinds of fonts
* in the file.
*/
protected Property buildFontProperty(String name, Map map, int fontType) {
protected Property buildFontProperty(String name, Map<Integer, PdfObject> map, int fontType) {
List<Property> fontList = new LinkedList<Property>(); // list of fonts
Iterator<PdfObject> fontIter = map.values().iterator();
while (fontIter.hasNext()) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -615,47 +615,43 @@ private boolean resourcesOK ()

// Check content streams for resources
if (docNode instanceof PageObject) {
List<PdfStream> streams =
((PageObject) docNode).getContentStreams ();
if (streams != null) {
Iterator<PdfStream> iter = streams.listIterator ();
while (iter.hasNext ()) {
PdfStream stream = iter.next ();
PdfDictionary dict = stream.getDict ();
PdfDictionary rs =
(PdfDictionary)
_module.resolveIndirectObject(dict.get ("Resources"));
if (rs != null) {
PdfDictionary cs = (PdfDictionary)
_module.resolveIndirectObject
(rs.get ("ColorSpace"));
if (!colorSpaceOK (cs)) {
return false;
}

PdfDictionary gs = (PdfDictionary)
_module.resolveIndirectObject
(rs.get ("ExtGState"));
if (!extGStateOK (gs)) {
return false;
}
ListIterator<PdfStream> streamIter = ((PageObject) docNode).getContentStreams().listIterator();
while (streamIter.hasNext ()) {
PdfStream stream = streamIter.next ();
PdfDictionary dict = stream.getDict ();
PdfDictionary rs =
(PdfDictionary)
_module.resolveIndirectObject(dict.get ("Resources"));
if (rs != null) {
PdfDictionary cs = (PdfDictionary)
_module.resolveIndirectObject
(rs.get ("ColorSpace"));
if (!colorSpaceOK (cs)) {
return false;
}

PdfDictionary xo = (PdfDictionary)
_module.resolveIndirectObject
(rs.get ("XObject"));
if (!xObjectsOK (xo)) {
return false;
}
PdfDictionary gs = (PdfDictionary)
_module.resolveIndirectObject
(rs.get ("ExtGState"));
if (!extGStateOK (gs)) {
return false;
}
// Also check for filters
PdfObject filters =
dict.get ("Filter");
if (hasFilters (filters, excludedFilters)) {

PdfDictionary xo = (PdfDictionary)
_module.resolveIndirectObject
(rs.get ("XObject"));
if (!xObjectsOK (xo)) {
return false;
}
}
// Also check for filters
PdfObject filters =
dict.get ("Filter");
if (hasFilters (filters, excludedFilters)) {
return false;
}
}

// Also check page objects for annotations.
// Must be one of the prescribed types, but not
// Movie, Sound, or FileAttachment.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,8 @@ public enum MessageConstants {
public static final JhoveMessage PDF_HUL_160 = messageFactory.getMessage("PDF-HUL-160");
public static final JhoveMessage PDF_HUL_161 = messageFactory.getMessage("PDF-HUL-161");
public static final JhoveMessage PDF_HUL_162 = messageFactory.getMessage("PDF-HUL-162");
public static final JhoveMessage PDF_HUL_163 = messageFactory.getMessage("PDF-HUL-163");
public static final JhoveMessage PDF_HUL_164 = messageFactory.getMessage("PDF-HUL-164");

/**
* Logger Messages
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
package edu.harvard.hul.ois.jhove.module.pdf;

import java.io.IOException;
import java.io.RandomAccessFile;
import java.util.ArrayList;
import java.util.List;
import java.util.Vector;
Expand All @@ -18,7 +19,7 @@
*/
public class PageObject extends DocNode
{
private List<PdfStream> _contentStreams = null; // contents of the page; may be null
private List<PdfStream> _contentStreams = new ArrayList<>(); // contents of the page, defaults to empty

/**
* Superclass constructor.
Expand Down Expand Up @@ -149,6 +150,18 @@ public PdfArray getBleedBox () throws PdfException
MessageConstants.PDF_HUL_25); // PDF-HUL-25
}

/**
 * Reads the bytes of each of this page's content streams from the file.
 * The bytes are read into a temporary buffer that is then discarded; no
 * checks are made on them here, and the value returned by the read call
 * is not inspected.
 *
 * @param raf
 *            the file the streams are read from
 * @throws IOException
 *             if reading a stream fails
 */
public void checkTextStreams(RandomAccessFile raf) throws IOException {
    if (_contentStreams != null) {
        for (final PdfStream content : _contentStreams) {
            final Stream source = content.getStream();
            // The buffer is sized from the stream's reported length
            final byte[] buffer = new byte[(int) source.getLength()];
            source.initRead(raf);
            source.read(buffer);
        }
    }
}

private static PdfArray retrieveAndCheckRectangle(final PdfDictionary dict,
final String dictKey, final JhoveMessage invalidMessage) throws PdfInvalidException {
PdfArray mbox = null;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
package edu.harvard.hul.ois.jhove.module.pdf;

import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;

/**
 * Basic validation of the text held in a single PDF content stream.
 *
 * A stream is treated as a text stream when it contains a <code>BT</code>
 * (begin text) operator followed by an <code>ET</code> (end text) operator.
 * Everything from the first <code>BT</code> up to the last <code>ET</code>
 * is scanned as one region, including anything that lies between separate
 * text objects. Only literal strings, delimited by parentheses, are
 * validated; a region whose first unescaped string delimiter is
 * <code>&lt;</code> is treated as a hex stream and skipped.
 */
public class PdfTextStream {
    /** Operator that begins a text object. */
    private static final String BEGIN_TEXT = "BT";
    /** Operator that ends a text object. */
    private static final String END_TEXT = "ET";

    private final byte[] streamData;

    /**
     * Reads the raw data of a content stream ready for validation.
     *
     * @param pdfStream
     *            the content stream to validate
     * @param raf
     *            the file the stream is read from
     * @throws IOException
     *             if the stream cannot be read
     */
    public PdfTextStream(PdfStream pdfStream, final RandomAccessFile raf) throws IOException {
        Stream stream = pdfStream.getStream();
        stream.initRead(raf);
        this.streamData = stream.getRawData();
    }

    /**
     * Checks that the parentheses delimiting literal strings are balanced.
     * Streams that are not text streams, and hex streams, are not checked.
     *
     * A character preceded by an unescaped reverse solidus is ignored, so
     * escaped parentheses do not count and an escaped reverse solidus does
     * not escape the character that follows it. A closing parenthesis with
     * no matching opening parenthesis is reported even if a later opening
     * parenthesis would even out the count. A text stream that contains no
     * parentheses at all is also reported.
     *
     * @throws PdfInvalidException
     *             carrying PDF-HUL-164 when the parentheses are unbalanced
     *             or missing
     */
    public void validate() throws PdfInvalidException {
        if (this.streamData == null) {
            // NOTE(review): assumes a stream without raw data has nothing to validate - confirm
            return;
        }
        // ISO-8859-1 maps each byte to exactly one char, so arbitrary
        // stream bytes can never be altered by the decoding
        final String chars = new String(this.streamData, StandardCharsets.ISO_8859_1);
        if (!isTextStream(chars) || isHexStream(chars)) {
            return;
        }
        final int textStart = chars.indexOf(BEGIN_TEXT);
        final int textEnd = chars.lastIndexOf(END_TEXT);
        int depth = 0;
        boolean escaped = false;
        boolean parenthesesFound = false;
        for (int i = textStart; i < textEnd; i++) {
            final char current = chars.charAt(i);
            if (escaped) {
                // Escaped characters may be legitimate unbalanced parentheses,
                // or an escaped reverse solidus that escapes nothing itself
                escaped = false;
                continue;
            }
            if (isReverseSolidus(current)) {
                escaped = true;
            } else if (current == '(') {
                parenthesesFound = true;
                depth++;
            } else if (current == ')') {
                parenthesesFound = true;
                depth--;
                if (depth < 0) {
                    // Closed before it was opened, a later '(' can't fix that
                    break;
                }
            }
        }
        if (depth != 0 || !parenthesesFound) {
            // The opening and closing parentheses don't match up, or there
            // are none. This is a sign that the text stream is malformed
            throw new PdfInvalidException(MessageConstants.PDF_HUL_164); // PDF-HUL-164
        }
    }

    /** Returns true if the character is a reverse solidus, false otherwise. */
    private static boolean isReverseSolidus(final char toValidate) {
        return toValidate == '\\';
    }

    /** Returns true if the character is one of the tokenizer's whitespace characters. */
    private static boolean isWhitespace(final char toValidate) {
        return Arrays.stream(Tokenizer.WHITESPACES).anyMatch(Character.valueOf(toValidate)::equals);
    }

    /**
     * Checks whether the stream is a text stream. That is the case when the
     * first BT comes before the last ET, the BT is either the first
     * character or is preceded by whitespace, and the ET is preceded by
     * whitespace.
     */
    private static boolean isTextStream(final String toValidate) {
        final int firstBt = toValidate.indexOf(BEGIN_TEXT);
        final int lastEt = toValidate.lastIndexOf(END_TEXT);
        if (firstBt < 0 || lastEt <= firstBt) {
            // One of the operators is missing, or they are in the wrong order
            return false;
        }
        // lastEt is at least 1 here, so looking at the previous character is safe
        return (firstBt == 0 || isWhitespace(toValidate.charAt(firstBt - 1)))
                && isWhitespace(toValidate.charAt(lastEt - 1));
    }

    /**
     * Checks whether the stream is a hex text stream, denoted by the
     * presence of &lt; and &gt; rather than ( and ). The decision is made
     * on the first unescaped delimiter found between the BT and ET
     * operators.
     */
    private static boolean isHexStream(final String toValidate) {
        final int textStart = toValidate.indexOf(BEGIN_TEXT);
        final int textEnd = toValidate.lastIndexOf(END_TEXT);
        if (textStart < 0) {
            return false;
        }
        boolean escaped = false;
        for (int i = textStart; i < textEnd; i++) {
            final char current = toValidate.charAt(i);
            if (escaped) {
                // An escaped delimiter doesn't start a string
                escaped = false;
                continue;
            }
            if (isReverseSolidus(current)) {
                escaped = true;
            } else if (current == '(') {
                // A literal text string comprised of characters
                return false;
            } else if (current == '<') {
                // A hex string comprised of pairs of hexadecimal digits
                // NOTE(review): a dictionary opening "<<" would also land here - confirm that is acceptable
                return true;
            }
        }
        // If we reach this point then the stream is not a hex stream
        return false;
    }

}
Loading

0 comments on commit a28bad1

Please sign in to comment.