Skip to content
Merged
Show file tree
Hide file tree
Changes from 11 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions eng/Versions.props
Original file line number Diff line number Diff line change
Expand Up @@ -171,5 +171,6 @@
<!-- MEVD is still part of the Semantic Kernel repo -->
<MicrosoftExtensionsVectorDataAbstractionsVersion>9.7.0</MicrosoftExtensionsVectorDataAbstractionsVersion>
<MicrosoftSemanticKernelConnectorsVersion>1.66.0-preview</MicrosoftSemanticKernelConnectorsVersion>
<MarkdigSignedVersion>0.43.0</MarkdigSignedVersion>
</PropertyGroup>
</Project>
1 change: 1 addition & 0 deletions eng/packages/General.props
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
<PackageVersion Include="Azure.AI.Inference" Version="1.0.0-beta.5" />
<PackageVersion Include="DnsClient" Version="1.8.0" />
<PackageVersion Include="ICSharpCode.Decompiler" Version="9.1.0.7988" />
<PackageVersion Include="Markdig.Signed" Version="$(MarkdigSignedVersion)" />
<PackageVersion Include="Microsoft.Bcl.HashCode" Version="1.1.1" />
<PackageVersion Include="Microsoft.CodeAnalysis.Analyzers" Version="$(MicrosoftCodeAnalysisAnalyzersVersion)" />
<PackageVersion Include="Microsoft.CodeAnalysis.Common" Version="$(MicrosoftCodeAnalysisVersion)" />
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.

using System;
using System.Diagnostics;
using System.IO;
using System.Text;
using System.Threading;
using System.Threading.Tasks;
using Microsoft.Shared.Diagnostics;

namespace Microsoft.Extensions.DataIngestion;

/// <summary>
/// Reads documents by converting them to Markdown using the <see href="https://github.com/microsoft/markitdown">MarkItDown</see> tool.
/// </summary>
/// <remarks>
/// The tool is launched as a child process. Its standard output is captured as UTF-8 text and
/// handed to <c>MarkdownParser.Parse</c> to build the resulting document.
/// </remarks>
public class MarkItDownReader : IngestionDocumentReader
{
    private readonly string _exePath;
    private readonly bool _extractImages;

    /// <summary>
    /// Initializes a new instance of the <see cref="MarkItDownReader"/> class.
    /// </summary>
    /// <param name="exePath">The path to the MarkItDown executable. When not provided, "markitdown" needs to be added to PATH.</param>
    /// <param name="extractImages">A value indicating whether to extract images. When <see langword="true"/>, the tool is invoked with <c>--keep-data-uris</c>.</param>
    public MarkItDownReader(string exePath = "markitdown", bool extractImages = false)
    {
        _exePath = Throw.IfNullOrEmpty(exePath);
        _extractImages = extractImages;
    }

    /// <inheritdoc/>
    /// <exception cref="FileNotFoundException"><paramref name="source"/> does not exist.</exception>
    /// <exception cref="InvalidOperationException">The MarkItDown process exited with a non-zero exit code.</exception>
    public override async Task<IngestionDocument> ReadAsync(FileInfo source, string identifier, string? mediaType = null, CancellationToken cancellationToken = default)
    {
        _ = Throw.IfNull(source);
        _ = Throw.IfNullOrEmpty(identifier);

        if (!source.Exists)
        {
            throw new FileNotFoundException("The specified file does not exist.", source.FullName);
        }

        ProcessStartInfo startInfo = new()
        {
            FileName = _exePath,
            UseShellExecute = false,
            CreateNoWindow = true,
            RedirectStandardOutput = true,
            StandardOutputEncoding = Encoding.UTF8,
        };

        // Force UTF-8 encoding in the environment (will produce garbage otherwise).
        startInfo.Environment["PYTHONIOENCODING"] = "utf-8";
        startInfo.Environment["LC_ALL"] = "C.UTF-8";
        startInfo.Environment["LANG"] = "C.UTF-8";

#if NET
        // ArgumentList takes care of quoting/escaping each argument.
        startInfo.ArgumentList.Add(source.FullName);
        if (_extractImages)
        {
            startInfo.ArgumentList.Add("--keep-data-uris");
        }
#else
        // Only the raw command line is used here, so the path is quoted manually.
        startInfo.Arguments = $"\"{source.FullName}\"" + (_extractImages ? " --keep-data-uris" : string.Empty);
#endif

        string outputContent;
        using (Process process = new() { StartInfo = startInfo })
        {
            _ = process.Start();

            try
            {
                // Drain standard output before waiting for exit: a child that fills the pipe
                // buffer would otherwise block forever waiting for a reader.
                outputContent = await process.StandardOutput.ReadToEndAsync(cancellationToken).ConfigureAwait(false);
#if NET
                await process.WaitForExitAsync(cancellationToken).ConfigureAwait(false);
#else
                process.WaitForExit();
#endif
            }
            catch
            {
                // Disposing a Process only releases the handle; it does not stop the child.
                // Terminate it explicitly so a cancelled or failed read does not leave the
                // converter running in the background.
                TryKill(process);
                throw;
            }

            if (process.ExitCode != 0)
            {
                throw new InvalidOperationException($"MarkItDown process failed with exit code {process.ExitCode}.");
            }
        }

        return MarkdownParser.Parse(outputContent, identifier);
    }

    /// <inheritdoc/>
    /// <remarks>
    /// The contents of <paramref name="source"/> are copied to a temporary file, which is deleted
    /// before this method completes, whether it succeeds or fails.
    /// </remarks>
    public override async Task<IngestionDocument> ReadAsync(Stream source, string identifier, string mediaType, CancellationToken cancellationToken = default)
    {
        _ = Throw.IfNull(source);
        _ = Throw.IfNullOrEmpty(identifier);

        // Writing to the process's standard input would avoid the temporary file, but although
        // MarkItDown documents stdin support, it did not work when tried: even the documented
        // sample command ("cat example.pdf | markitdown") fails. Until that is understood,
        // the content is staged in a temporary file.
        string inputFilePath = Path.Combine(Path.GetTempPath(), Path.GetRandomFileName());

        // Created outside the try block on purpose: if FileMode.CreateNew fails because the path
        // already exists, the file is not ours and must not be deleted by the finally block.
        // bufferSize: 1 is kept from the original code.
        FileStream inputFile = new(inputFilePath, FileMode.CreateNew, FileAccess.Write, FileShare.None, bufferSize: 1, FileOptions.Asynchronous);

        try
        {
            // The copy is inside the try block so that a failed or cancelled copy still removes
            // the temporary file.
            using (inputFile)
            {
                await source
#if NET
                    .CopyToAsync(inputFile, cancellationToken)
#else
                    // The two-argument overload cannot observe cancellation on this target;
                    // 81920 is the default buffer size used by Stream.CopyToAsync.
                    .CopyToAsync(inputFile, bufferSize: 81920, cancellationToken)
#endif
                    .ConfigureAwait(false);
            }

            return await ReadAsync(new FileInfo(inputFilePath), identifier, mediaType, cancellationToken).ConfigureAwait(false);
        }
        finally
        {
            File.Delete(inputFilePath);
        }
    }

    /// <summary>
    /// Attempts to terminate <paramref name="process"/> without throwing, so that the exception
    /// which triggered the cleanup is the one that propagates to the caller.
    /// </summary>
    /// <param name="process">The started child process to terminate.</param>
    private static void TryKill(Process process)
    {
        try
        {
            if (!process.HasExited)
            {
#if NET
                // Include descendants so that any child processes the tool started are stopped too.
                process.Kill(entireProcessTree: true);
#else
                process.Kill();
#endif
            }
        }
        catch (Exception ex) when (ex is InvalidOperationException or NotSupportedException or AggregateException or System.ComponentModel.Win32Exception)
        {
            // The process already exited or could not be terminated; nothing more can be done here.
        }
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
<Project Sdk="Microsoft.NET.Sdk">

<PropertyGroup>
<!-- Adds netstandard2.0 on top of whatever target frameworks are already defined for this project. -->
<TargetFrameworks>$(TargetFrameworks);netstandard2.0</TargetFrameworks>
<RootNamespace>Microsoft.Extensions.DataIngestion</RootNamespace>

<!-- we are not ready to publish yet -->
<IsPackable>false</IsPackable>
<Stage>preview</Stage>
<EnablePackageValidation>false</EnablePackageValidation>
</PropertyGroup>

<ItemGroup>
<!-- MarkdownParser.cs is compiled into this assembly as a linked source file from the sibling Markdig project; there is no project reference to that project. -->
<Compile Include="..\Microsoft.Extensions.DataIngestion.Markdig\MarkdownParser.cs" Link="MarkdownParser.cs" />
</ItemGroup>

<ItemGroup>
<ProjectReference Include="..\Microsoft.Extensions.DataIngestion.Abstractions\Microsoft.Extensions.DataIngestion.Abstractions.csproj" />
</ItemGroup>

<ItemGroup>
<!-- No Version attribute here: the version is expected to come from the central PackageVersion entry for Markdig.Signed (eng/packages/General.props). -->
<PackageReference Include="Markdig.Signed" />
</ItemGroup>

</Project>
Loading
Loading