Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
107 changes: 52 additions & 55 deletions sdk/formrecognizer/Azure.AI.FormRecognizer/src/FormRecognizerClient.cs

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,6 @@ public enum ServiceVersion
#pragma warning restore CA1707 // Identifiers should not contain underscores
}


/// <summary>
/// Gets the <see cref="ServiceVersion"/> value held by this instance. The property is read-only.
/// NOTE(review): presumably this is the service API version the client targets — confirm against the constructor, which is not visible here.
/// </summary>
public ServiceVersion Version { get; }
Expand Down
Original file line number Diff line number Diff line change
@@ -1,10 +1,6 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

using System;
using System.Collections.Generic;
using System.Text;

namespace Azure.AI.FormRecognizer.Models
{
/// <summary>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,5 +17,8 @@ public RecognizeOptions()
/// </summary>
public bool IncludeTextContent { get; set; } = false;

/// <summary>
/// Gets or sets the content type to associate with a recognize operation. Defaults to <c>null</c>,
/// meaning that no content type has been specified.
/// NOTE(review): presumably the client falls back to detecting the type from the stream when this is <c>null</c> — confirm against the client implementation.
/// </summary>
public ContentType? ContentType { get; set; } = null;
}
}
92 changes: 92 additions & 0 deletions sdk/formrecognizer/Azure.AI.FormRecognizer/src/StreamExtensions.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

using System;
using System.IO;
using Azure.AI.FormRecognizer.Models;

namespace Azure.AI.FormRecognizer
{
internal static class StreamExtensions
{
    /// <summary>The set of bytes expected to be present at the start of PDF files ("%PDF" in ASCII).</summary>
    private static readonly byte[] PdfHeader = new byte[] { 0x25, 0x50, 0x44, 0x46 };

    /// <summary>The set of bytes expected to be present at the start of PNG files.</summary>
    private static readonly byte[] PngHeader = new byte[] { 0x89, 0x50, 0x4E, 0x47 };

    /// <summary>The set of bytes expected to be present at the start of JPEG files.</summary>
    private static readonly byte[] JpegHeader = new byte[] { 0xFF, 0xD8 };

    /// <summary>The set of bytes expected to be present at the start of TIFF (little-endian) files ("II*\0" in ASCII).</summary>
    private static readonly byte[] TiffLeHeader = new byte[] { 0x49, 0x49, 0x2A, 0x00 };

    /// <summary>The set of bytes expected to be present at the start of TIFF (big-endian) files ("MM\0*" in ASCII).</summary>
    private static readonly byte[] TiffBeHeader = new byte[] { 0x4D, 0x4D, 0x00, 0x2A };

    /// <summary>
    /// Attempts to detect the <see cref="ContentType"/> of a stream of bytes. The algorithm compares the
    /// bytes found at the stream's current position to well-known file signatures (PDF, PNG, JPEG and TIFF,
    /// checked in that order). The position of the stream is the same after the call as it was before.
    /// </summary>
    /// <param name="stream">The stream on which the content type detection attempt will be performed.</param>
    /// <param name="contentType">If the detection is successful, outputs the detected content type. Otherwise, <c>default</c>.</param>
    /// <returns><c>true</c> if the detection was successful. Otherwise, <c>false</c>.</returns>
    /// <exception cref="ArgumentNullException">Happens when <paramref name="stream"/> is <c>null</c>.</exception>
    /// <exception cref="NotSupportedException">Happens when <paramref name="stream"/> is not seekable or readable.</exception>
    public static bool TryGetContentType(this Stream stream, out ContentType contentType)
    {
        if (stream is null)
        {
            throw new ArgumentNullException(nameof(stream));
        }

        if (stream.BeginsWithHeader(PdfHeader))
        {
            contentType = ContentType.Pdf;
        }
        else if (stream.BeginsWithHeader(PngHeader))
        {
            contentType = ContentType.Png;
        }
        else if (stream.BeginsWithHeader(JpegHeader))
        {
            contentType = ContentType.Jpeg;
        }
        else if (stream.BeginsWithHeader(TiffLeHeader) || stream.BeginsWithHeader(TiffBeHeader))
        {
            contentType = ContentType.Tiff;
        }
        else
        {
            contentType = default;
            return false;
        }

        return true;
    }

    /// <summary>
    /// Determines whether the bytes at the current position of a stream match a specified sequence of bytes.
    /// The stream's position is always restored before returning, even when reading fails.
    /// </summary>
    /// <param name="stream">The stream to be verified.</param>
    /// <param name="header">The sequence of bytes expected to be found at the current position of <paramref name="stream"/>.</param>
    /// <returns><c>true</c> if the <paramref name="stream"/> begins with the specified <paramref name="header"/>. Otherwise, <c>false</c>.</returns>
    private static bool BeginsWithHeader(this Stream stream, byte[] header)
    {
        var originalPosition = stream.Position;

        // Not enough bytes left to hold the header, so there is no need to read anything.
        if (stream.Length - originalPosition < header.Length)
        {
            return false;
        }

        try
        {
            foreach (var headerByte in header)
            {
                // ReadByte returns -1 at the end of the stream. Comparing as int (instead of casting
                // the result to byte) guarantees the end-of-stream marker is never mistaken for 0xFF.
                if (stream.ReadByte() != headerByte)
                {
                    return false;
                }
            }

            return true;
        }
        finally
        {
            // Rewind on every exit path so that callers can consume the stream from where they left it.
            stream.Position = originalPosition;
        }
    }
}
}
Binary file not shown.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

using System;
using System.IO;
using System.Text;
using Azure.AI.FormRecognizer.Models;
using NUnit.Framework;

namespace Azure.AI.FormRecognizer.Tests
{
/// <summary>
/// The suite of tests for the <see cref="StreamExtensions"/> class.
/// </summary>
public class StreamExtensionsTests
{
    /// <summary>
    /// Verifies that <see cref="StreamExtensions.TryGetContentType"/> recognizes a PDF file
    /// taken from the test assets.
    /// </summary>
    [Test]
    public void TryGetContentTypeDetectsPdf()
    {
        // Assets are opened read-only: the default access of FileStream(path, FileMode.Open) is
        // read/write, which fails when the test assets are write-protected.
        using var stream = File.OpenRead(TestEnvironment.RetrieveInvoicePath(1, ContentType.Pdf));

        Assert.True(stream.TryGetContentType(out var contentType));
        Assert.AreEqual(ContentType.Pdf, contentType);
    }

    /// <summary>
    /// Verifies that <see cref="StreamExtensions.TryGetContentType"/> recognizes a PNG file
    /// taken from the test assets.
    /// </summary>
    [Test]
    public void TryGetContentTypeDetectsPng()
    {
        using var stream = File.OpenRead(TestEnvironment.PngReceiptPath);

        Assert.True(stream.TryGetContentType(out var contentType));
        Assert.AreEqual(ContentType.Png, contentType);
    }

    /// <summary>
    /// Verifies that <see cref="StreamExtensions.TryGetContentType"/> recognizes a JPEG file
    /// taken from the test assets.
    /// </summary>
    [Test]
    public void TryGetContentTypeDetectsJpeg()
    {
        using var stream = File.OpenRead(TestEnvironment.JpgReceiptPath);

        Assert.True(stream.TryGetContentType(out var contentType));
        Assert.AreEqual(ContentType.Jpeg, contentType);
    }

    /// <summary>
    /// Verifies that <see cref="StreamExtensions.TryGetContentType"/> recognizes the TIFF file
    /// taken from the test assets.
    /// </summary>
    [Test]
    public void TryGetContentTypeDetectsLittleEndianTiff()
    {
        using var stream = File.OpenRead(TestEnvironment.RetrieveInvoicePath(1, ContentType.Tiff));

        Assert.True(stream.TryGetContentType(out var contentType));
        Assert.AreEqual(ContentType.Tiff, contentType);
    }

    /// <summary>
    /// Verifies that <see cref="StreamExtensions.TryGetContentType"/> recognizes the big-endian
    /// TIFF signature.
    /// </summary>
    [Test]
    public void TryGetContentTypeDetectsBigEndianTiff()
    {
        // Currently there are no big-endian TIFF files available in the test assets, so
        // we'll simulate one in a MemoryStream. These files start with the "MM\0*" header
        // in ASCII encoding.

        using var stream = new MemoryStream(Encoding.ASCII.GetBytes("MM\0*I am a completely normal TIFF file. Trust me."));

        Assert.True(stream.TryGetContentType(out var contentType));
        Assert.AreEqual(ContentType.Tiff, contentType);
    }

    /// <summary>
    /// Verifies that <see cref="StreamExtensions.TryGetContentType"/> reports failure and outputs
    /// the default value when the content matches no known signature.
    /// </summary>
    [Test]
    public void TryGetContentTypeCannotDetectUnknownType()
    {
        using var stream = new MemoryStream(Encoding.UTF8.GetBytes("I am probably unknown."));

        Assert.False(stream.TryGetContentType(out var contentType));
        Assert.AreEqual(default(ContentType), contentType);
    }

    /// <summary>
    /// Verifies that <see cref="StreamExtensions.TryGetContentType"/> reports failure, instead of
    /// throwing, when the stream holds no bytes at all.
    /// </summary>
    [Test]
    public void TryGetContentTypeDoesNotThrowForEmptyStream()
    {
        using var stream = new MemoryStream(Array.Empty<byte>());

        Assert.False(stream.TryGetContentType(out var contentType));
        Assert.AreEqual(default(ContentType), contentType);
    }

    /// <summary>
    /// Verifies that <see cref="StreamExtensions.TryGetContentType"/> inspects the bytes found at the
    /// stream's current position and leaves that position untouched.
    /// </summary>
    [Test]
    public void TryGetContentTypePreservesStreamPosition()
    {
        // Two filler bytes followed by the "%PDF" signature; detection must start after the filler.
        using var stream = new MemoryStream(Encoding.ASCII.GetBytes("xx%PDF-1.7"));
        stream.Position = 2;

        Assert.True(stream.TryGetContentType(out var contentType));
        Assert.AreEqual(ContentType.Pdf, contentType);
        Assert.AreEqual(2L, stream.Position);
    }
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ public async Task StartRecognizeContentPopulatesFormPage(bool useStream)

if (useStream)
{
using var stream = new FileStream(TestEnvironment.RetrieveInvoicePath(1), FileMode.Open);
using var stream = new FileStream(TestEnvironment.RetrieveInvoicePath(1, ContentType.Pdf), FileMode.Open);
operation = await client.StartRecognizeContentAsync(stream);
}
else
Expand Down Expand Up @@ -135,12 +135,12 @@ public async Task StartRecognizeReceiptsPopulatesExtractedReceipt(bool useStream

if (useStream)
{
using var stream = new FileStream(TestEnvironment.ReceiptPath, FileMode.Open);
operation = await client.StartRecognizeReceiptsAsync(stream, ContentType.Jpeg);
using var stream = new FileStream(TestEnvironment.JpgReceiptPath, FileMode.Open);
operation = await client.StartRecognizeReceiptsAsync(stream);
}
else
{
var uri = new Uri(TestEnvironment.ReceiptUri);
var uri = new Uri(TestEnvironment.JpgReceiptUri);
operation = await client.StartRecognizeReceiptsFromUriAsync(uri, default);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -83,12 +83,13 @@ public void StartRecognizeContentRequiresTheFormFileStream()
public void StartRecognizeContentRespectsTheCancellationToken()
{
var client = CreateInstrumentedClient();
var options = new RecognizeOptions { ContentType = ContentType.Pdf };

using var stream = new MemoryStream(Array.Empty<byte>());
using var cancellationSource = new CancellationTokenSource();
cancellationSource.Cancel();

Assert.ThrowsAsync<TaskCanceledException>(async () => await client.StartRecognizeContentAsync(stream, default, cancellationSource.Token));
Assert.ThrowsAsync<TaskCanceledException>(async () => await client.StartRecognizeContentAsync(stream, options, cancellationSource.Token));
}

/// <summary>
Expand Down Expand Up @@ -116,7 +117,7 @@ public void StartRecognizeContentFromUriRespectsTheCancellationToken()
using var cancellationSource = new CancellationTokenSource();
cancellationSource.Cancel();

Assert.ThrowsAsync<TaskCanceledException>(async () => await client.StartRecognizeContentFromUriAsync(fakeUri, default, cancellationSource.Token));
Assert.ThrowsAsync<TaskCanceledException>(async () => await client.StartRecognizeContentFromUriAsync(fakeUri, cancellationToken: cancellationSource.Token));
}

/// <summary>
Expand All @@ -128,7 +129,7 @@ public void StartRecognizeContentFromUriRespectsTheCancellationToken()
public void StartRecognizeReceiptsRequiresTheReceiptFileStream()
{
var client = CreateInstrumentedClient();
Assert.ThrowsAsync<ArgumentNullException>(async () => await client.StartRecognizeReceiptsAsync(null, ContentType.Jpeg));
Assert.ThrowsAsync<ArgumentNullException>(async () => await client.StartRecognizeReceiptsAsync(null));
}

/// <summary>
Expand All @@ -139,12 +140,13 @@ public void StartRecognizeReceiptsRequiresTheReceiptFileStream()
public void StartRecognizeReceiptsRespectsTheCancellationToken()
{
var client = CreateInstrumentedClient();
var options = new RecognizeOptions { ContentType = ContentType.Pdf };

using var stream = new MemoryStream(Array.Empty<byte>());
using var cancellationSource = new CancellationTokenSource();
cancellationSource.Cancel();

Assert.ThrowsAsync<TaskCanceledException>(async () => await client.StartRecognizeReceiptsAsync(stream, ContentType.Jpeg, cancellationToken: cancellationSource.Token));
Assert.ThrowsAsync<TaskCanceledException>(async () => await client.StartRecognizeReceiptsAsync(stream, recognizeOptions: options, cancellationToken: cancellationSource.Token));
}

/// <summary>
Expand Down
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

using System;
using System.IO;
using System.Reflection;
using Azure.AI.FormRecognizer.Models;

namespace Azure.AI.FormRecognizer.Tests
{
Expand All @@ -21,35 +23,58 @@ public static class TestEnvironment
private const string AssetsFolderName = "Assets";

/// <summary>The name of the JPG file which contains the receipt to be used for tests.</summary>
private const string ReceiptFilename = "contoso-receipt.jpg";
private const string JpgReceiptFilename = "contoso-receipt.jpg";

/// <summary>The format to generate the filenames of the PDF forms to be used for tests.</summary>
private const string InvoiceFilenameFormat = "Invoice_{0}.pdf";
/// <summary>The name of the PNG file which contains the receipt to be used for tests.</summary>
private const string PngReceiptFilename = "contoso-allinone.png";

/// <summary>The format to generate the filenames of the forms to be used for tests.</summary>
private const string InvoiceFilenameFormat = "Invoice_{0}.{1}";

/// <summary>The format to generate the GitHub URIs of the files to be used for tests.</summary>
private const string FileUriFormat = "https://raw.githubusercontent.com/Azure/azure-sdk-for-net/master/sdk/formrecognizer/Azure.AI.FormRecognizer/tests/{0}/{1}";

/// <summary>
/// The name of the directory where the running assembly is located.
/// </summary>
/// <value>The name of the current working directory.</value>
private static string CurrentWorkingDirectory => Path.GetDirectoryName(Assembly.GetExecutingAssembly().Location);

/// <summary>
/// The relative path to the JPG file which contains the receipt to be used for tests.
/// </summary>
/// <value>The relative path to the JPG file.</value>
public static string ReceiptPath => Path.Combine(Path.GetDirectoryName(Assembly.GetExecutingAssembly().Location), AssetsFolderName, ReceiptFilename);
public static string JpgReceiptPath => Path.Combine(CurrentWorkingDirectory, AssetsFolderName, JpgReceiptFilename);

/// <summary>
/// The relative path to the PNG file which contains the receipt to be used for tests.
/// </summary>
/// <value>The relative path to the PNG file.</value>
public static string PngReceiptPath => Path.Combine(CurrentWorkingDirectory, AssetsFolderName, PngReceiptFilename);

/// <summary>
/// The URI string to the JPG file which contains the receipt to be used for tests.
/// </summary>
/// <value>The URI string to the JPG file.</value>
public static string ReceiptUri => string.Format(FileUriFormat, AssetsFolderName, ReceiptFilename);
public static string JpgReceiptUri => string.Format(FileUriFormat, AssetsFolderName, JpgReceiptFilename);

/// <summary>
/// Retrieves the relative path to a PDF form available in the test assets.
/// Retrieves the relative path to a PDF or TIFF form available in the test assets.
/// </summary>
/// <param name="index">The index to specify the form to be retrieved.</param>
/// <returns>The relative path to the PDF form corresponding to the specified index.</returns>
public static string RetrieveInvoicePath(int index)
/// <param name="contentType">The type of the form to be retrieved. Currently only PDF and TIFF are available.</param>
/// <returns>The relative path to the PDF or TIFF form corresponding to the specified index.</returns>
public static string RetrieveInvoicePath(int index, ContentType contentType)
{
var filename = string.Format(InvoiceFilenameFormat, index);
return Path.Combine(Path.GetDirectoryName(Assembly.GetExecutingAssembly().Location), AssetsFolderName, filename);
var extension = contentType switch
{
ContentType.Pdf => "pdf",
ContentType.Tiff => "tiff",
_ => throw new ArgumentException("The requested content type is not available.", nameof(contentType))
};

var filename = string.Format(InvoiceFilenameFormat, index, extension);
return Path.Combine(CurrentWorkingDirectory, AssetsFolderName, filename);
}

/// <summary>
Expand All @@ -59,7 +84,7 @@ public static string RetrieveInvoicePath(int index)
/// <returns>The URI string to the PDF form corresponding to the specified index.</returns>
public static string RetrieveInvoiceUri(int index)
{
var filename = string.Format(InvoiceFilenameFormat, index);
var filename = string.Format(InvoiceFilenameFormat, index, "pdf");
return string.Format(FileUriFormat, AssetsFolderName, filename);
}
}
Expand Down