-
Notifications
You must be signed in to change notification settings - Fork 5.1k
[FormRecognizer] Preview 1: detect content type when not provided #11357
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from all commits
Commits
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
107 changes: 52 additions & 55 deletions
107
sdk/formrecognizer/Azure.AI.FormRecognizer/src/FormRecognizerClient.cs
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
4 changes: 0 additions & 4 deletions
4
sdk/formrecognizer/Azure.AI.FormRecognizer/src/ReceiptExtensions.cs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
92 changes: 92 additions & 0 deletions
92
sdk/formrecognizer/Azure.AI.FormRecognizer/src/StreamExtensions.cs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,92 @@ | ||
| // Copyright (c) Microsoft Corporation. All rights reserved. | ||
| // Licensed under the MIT License. | ||
|
|
||
| using System; | ||
| using System.IO; | ||
| using Azure.AI.FormRecognizer.Models; | ||
|
|
||
namespace Azure.AI.FormRecognizer
{
    /// <summary>
    /// Extension methods that detect the <see cref="ContentType"/> of a <see cref="Stream"/>
    /// by comparing its leading bytes against well-known file signatures.
    /// </summary>
    internal static class StreamExtensions
    {
        /// <summary>The set of bytes expected to be present at the start of PDF files ("%PDF" in ASCII).</summary>
        private static readonly byte[] PdfHeader = new byte[] { 0x25, 0x50, 0x44, 0x46 };

        /// <summary>The set of bytes expected to be present at the start of PNG files (0x89 followed by "PNG" in ASCII).</summary>
        private static readonly byte[] PngHeader = new byte[] { 0x89, 0x50, 0x4E, 0x47 };

        /// <summary>The set of bytes expected to be present at the start of JPEG files.</summary>
        private static readonly byte[] JpegHeader = new byte[] { 0xFF, 0xD8 };

        /// <summary>The set of bytes expected to be present at the start of TIFF (little-endian) files ("II*\0" in ASCII).</summary>
        private static readonly byte[] TiffLeHeader = new byte[] { 0x49, 0x49, 0x2A, 0x00 };

        /// <summary>The set of bytes expected to be present at the start of TIFF (big-endian) files ("MM\0*" in ASCII).</summary>
        private static readonly byte[] TiffBeHeader = new byte[] { 0x4D, 0x4D, 0x00, 0x2A };

        /// <summary>
        /// Attempts to detect the <see cref="ContentType"/> of a stream of bytes. The algorithm reads the
        /// bytes found at the stream's current position and compares them to well-known file signatures.
        /// The stream's position is left unchanged.
        /// </summary>
        /// <param name="stream">The stream on which the content type detection attempt will be performed.</param>
        /// <param name="contentType">If the detection is successful, outputs the detected content type. Otherwise, <c>default</c>.</param>
        /// <returns><c>true</c> if the detection was successful. Otherwise, <c>false</c>.</returns>
        /// <exception cref="NotSupportedException">Happens when <paramref name="stream"/> is not seekable or readable.</exception>
        public static bool TryGetContentType(this Stream stream, out ContentType contentType)
        {
            if (stream.BeginsWithHeader(PdfHeader))
            {
                contentType = ContentType.Pdf;
            }
            else if (stream.BeginsWithHeader(PngHeader))
            {
                contentType = ContentType.Png;
            }
            else if (stream.BeginsWithHeader(JpegHeader))
            {
                contentType = ContentType.Jpeg;
            }
            else if (stream.BeginsWithHeader(TiffLeHeader) || stream.BeginsWithHeader(TiffBeHeader))
            {
                contentType = ContentType.Tiff;
            }
            else
            {
                contentType = default;
                return false;
            }

            return true;
        }

        /// <summary>
        /// Determines whether a stream, read from its current position, begins with a specified sequence of bytes.
        /// The stream's position is restored before returning, even if reading fails.
        /// </summary>
        /// <param name="stream">The stream to be verified.</param>
        /// <param name="header">The sequence of bytes expected to be at the start of <paramref name="stream"/>.</param>
        /// <returns><c>true</c> if the <paramref name="stream"/> begins with the specified <paramref name="header"/>. Otherwise, <c>false</c>.</returns>
        private static bool BeginsWithHeader(this Stream stream, byte[] header)
        {
            var originalPosition = stream.Position;

            // Not enough bytes left to hold the header; nothing has been read, so no rewind is needed.
            if (stream.Length - originalPosition < header.Length)
            {
                return false;
            }

            try
            {
                foreach (var headerByte in header)
                {
                    // ReadByte returns -1 at end of stream. Compare as int: casting -1 to byte
                    // would yield 0xFF and could falsely match a 0xFF header byte (e.g. JPEG).
                    if (stream.ReadByte() != headerByte)
                    {
                        return false;
                    }
                }

                return true;
            }
            finally
            {
                // Always rewind so subsequent signature checks and the caller see the stream untouched.
                stream.Position = originalPosition;
            }
        }
    }
}
Binary file added
BIN
+74.9 KB
sdk/formrecognizer/Azure.AI.FormRecognizer/tests/Assets/Invoice_1.tiff
Binary file not shown.
Binary file added
BIN
+1.86 MB
sdk/formrecognizer/Azure.AI.FormRecognizer/tests/Assets/contoso-allinone.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
105 changes: 105 additions & 0 deletions
105
sdk/formrecognizer/Azure.AI.FormRecognizer/tests/Extensions/StreamExtensionsTests.cs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,105 @@ | ||
| // Copyright (c) Microsoft Corporation. All rights reserved. | ||
| // Licensed under the MIT License. | ||
|
|
||
| using System; | ||
| using System.IO; | ||
| using System.Text; | ||
| using Azure.AI.FormRecognizer.Models; | ||
| using NUnit.Framework; | ||
|
|
||
namespace Azure.AI.FormRecognizer.Tests
{
    /// <summary>
    /// The suite of tests for the <see cref="StreamExtensions"/> class.
    /// </summary>
    public class StreamExtensionsTests
    {
        /// <summary>
        /// Verifies that <see cref="StreamExtensions.TryGetContentType"/> recognizes a PDF test asset.
        /// </summary>
        [Test]
        public void TryGetContentTypeDetectsPdf()
        {
            // File.OpenRead requests read-only access, so the test works on read-only assets and
            // does not conflict with other tests that open the same file concurrently.
            using var stream = File.OpenRead(TestEnvironment.RetrieveInvoicePath(1, ContentType.Pdf));

            Assert.True(stream.TryGetContentType(out var contentType));
            Assert.AreEqual(ContentType.Pdf, contentType);
        }

        /// <summary>
        /// Verifies that <see cref="StreamExtensions.TryGetContentType"/> recognizes a PNG test asset.
        /// </summary>
        [Test]
        public void TryGetContentTypeDetectsPng()
        {
            using var stream = File.OpenRead(TestEnvironment.PngReceiptPath);

            Assert.True(stream.TryGetContentType(out var contentType));
            Assert.AreEqual(ContentType.Png, contentType);
        }

        /// <summary>
        /// Verifies that <see cref="StreamExtensions.TryGetContentType"/> recognizes a JPEG test asset.
        /// </summary>
        [Test]
        public void TryGetContentTypeDetectsJpeg()
        {
            using var stream = File.OpenRead(TestEnvironment.JpgReceiptPath);

            Assert.True(stream.TryGetContentType(out var contentType));
            Assert.AreEqual(ContentType.Jpeg, contentType);
        }

        /// <summary>
        /// Verifies that <see cref="StreamExtensions.TryGetContentType"/> recognizes a TIFF test asset
        /// (expected, per the test name, to be little-endian).
        /// </summary>
        [Test]
        public void TryGetContentTypeDetectsLittleEndianTiff()
        {
            using var stream = File.OpenRead(TestEnvironment.RetrieveInvoicePath(1, ContentType.Tiff));

            Assert.True(stream.TryGetContentType(out var contentType));
            Assert.AreEqual(ContentType.Tiff, contentType);
        }

        /// <summary>
        /// Verifies that <see cref="StreamExtensions.TryGetContentType"/> recognizes the big-endian TIFF signature.
        /// </summary>
        [Test]
        public void TryGetContentTypeDetectsBigEndianTiff()
        {
            // Currently there are no big-endian TIFF files available in the test assets, so
            // we'll simulate one in a MemoryStream. These files start with the "MM\0*" header
            // in ASCII encoding.

            using var stream = new MemoryStream(Encoding.ASCII.GetBytes("MM\0*I am a completely normal TIFF file. Trust me."));

            Assert.True(stream.TryGetContentType(out var contentType));
            Assert.AreEqual(ContentType.Tiff, contentType);
        }

        /// <summary>
        /// Verifies that <see cref="StreamExtensions.TryGetContentType"/> reports failure and outputs
        /// <c>default</c> when the stream matches no known signature.
        /// </summary>
        [Test]
        public void TryGetContentTypeCannotDetectUnknownType()
        {
            using var stream = new MemoryStream(Encoding.UTF8.GetBytes("I am probably unknown."));

            Assert.False(stream.TryGetContentType(out var contentType));
            Assert.AreEqual(default(ContentType), contentType);
        }

        /// <summary>
        /// Verifies that <see cref="StreamExtensions.TryGetContentType"/> reports failure, without throwing,
        /// when the stream is empty.
        /// </summary>
        [Test]
        public void TryGetContentTypeDoesNotThrowForEmptyStream()
        {
            using var stream = new MemoryStream(Array.Empty<byte>());

            Assert.False(stream.TryGetContentType(out var contentType));
            Assert.AreEqual(default(ContentType), contentType);
        }

        /// <summary>
        /// Verifies that <see cref="StreamExtensions.TryGetContentType"/> inspects the bytes at the stream's
        /// current position and leaves that position unchanged.
        /// </summary>
        [Test]
        public void TryGetContentTypePreservesStreamPosition()
        {
            // Two filler bytes precede the "%PDF" signature; detection starts at the current position.
            using var stream = new MemoryStream(Encoding.ASCII.GetBytes("xx%PDF-1.7"));
            stream.Position = 2;

            Assert.True(stream.TryGetContentType(out var contentType));
            Assert.AreEqual(ContentType.Pdf, contentType);
            Assert.AreEqual(2, stream.Position);
        }
    }
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.