Skip to content
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions eng/Versions.props
Original file line number Diff line number Diff line change
Expand Up @@ -171,5 +171,6 @@
<!-- MEVD is still part of the Semantic Kernel repo -->
<MicrosoftExtensionsVectorDataAbstractionsVersion>9.7.0</MicrosoftExtensionsVectorDataAbstractionsVersion>
<MicrosoftSemanticKernelConnectorsVersion>1.66.0-preview</MicrosoftSemanticKernelConnectorsVersion>
<MarkdigVersion>0.42.0</MarkdigVersion>
</PropertyGroup>
</Project>
1 change: 1 addition & 0 deletions eng/packages/General.props
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
<PackageVersion Include="Azure.AI.Inference" Version="1.0.0-beta.5" />
<PackageVersion Include="DnsClient" Version="1.8.0" />
<PackageVersion Include="ICSharpCode.Decompiler" Version="9.1.0.7988" />
<PackageVersion Include="Markdig" Version="$(MarkdigVersion)" />
<PackageVersion Include="Microsoft.Bcl.HashCode" Version="1.1.1" />
<PackageVersion Include="Microsoft.CodeAnalysis.Analyzers" Version="$(MicrosoftCodeAnalysisAnalyzersVersion)" />
<PackageVersion Include="Microsoft.CodeAnalysis.Common" Version="$(MicrosoftCodeAnalysisVersion)" />
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.

using System;
using System.Diagnostics;
using System.IO;
using System.Text;
using System.Threading;
using System.Threading.Tasks;
using Microsoft.Shared.Diagnostics;

namespace Microsoft.Extensions.DataIngestion;

/// <summary>
/// Reads documents by converting them to Markdown using the <see href="https://github.com/microsoft/markitdown">MarkItDown</see> tool.
/// </summary>
/// <remarks>
/// Every read launches the MarkItDown executable as a child process, captures its standard output
/// as UTF-8 text and passes the resulting Markdown to <c>MarkdownParser.Parse</c>.
/// </remarks>
public class MarkItDownReader : IngestionDocumentReader
{
    // Same value Stream.CopyToAsync uses by default; needed to reach the overload that accepts a token.
    private const int CopyBufferSize = 81920;

    private readonly string _exePath;
    private readonly bool _extractImages;

    /// <summary>
    /// Initializes a new instance of the <see cref="MarkItDownReader"/> class.
    /// </summary>
    /// <param name="exePath">The path to the MarkItDown executable. When not provided, "markitdown" needs to be added to PATH.</param>
    /// <param name="extractImages">A value indicating whether to extract images. When <see langword="true"/>, <c>--keep-data-uris</c> is passed to MarkItDown.</param>
    public MarkItDownReader(string exePath = "markitdown", bool extractImages = false)
    {
        _exePath = Throw.IfNullOrEmpty(exePath);
        _extractImages = extractImages;
    }

    /// <inheritdoc/>
    /// <remarks>
    /// <paramref name="mediaType"/> is not used by this overload; only the file path is handed to MarkItDown.
    /// </remarks>
    /// <exception cref="FileNotFoundException"><paramref name="source"/> does not exist.</exception>
    /// <exception cref="InvalidOperationException">The MarkItDown process exited with a non-zero exit code.</exception>
    /// <exception cref="OperationCanceledException"><paramref name="cancellationToken"/> was cancelled.</exception>
    public override async Task<IngestionDocument> ReadAsync(FileInfo source, string identifier, string? mediaType = null, CancellationToken cancellationToken = default)
    {
        _ = Throw.IfNull(source);
        _ = Throw.IfNullOrEmpty(identifier);

        if (!source.Exists)
        {
            throw new FileNotFoundException("The specified file does not exist.", source.FullName);
        }

        // Do not spawn a process for a request that has already been cancelled.
        cancellationToken.ThrowIfCancellationRequested();

        ProcessStartInfo startInfo = new()
        {
            FileName = _exePath,
            UseShellExecute = false,
            CreateNoWindow = true,
            RedirectStandardOutput = true,
            StandardOutputEncoding = Encoding.UTF8,
        };

        // Force UTF-8 encoding in the environment (will produce garbage otherwise).
        startInfo.Environment["PYTHONIOENCODING"] = "utf-8";
        startInfo.Environment["LC_ALL"] = "C.UTF-8";
        startInfo.Environment["LANG"] = "C.UTF-8";

#if NET
        // ArgumentList takes care of quoting/escaping each argument.
        startInfo.ArgumentList.Add(source.FullName);
        if (_extractImages)
        {
            startInfo.ArgumentList.Add("--keep-data-uris");
        }
#else
        // ArgumentList is not available here, so the path is quoted manually.
        startInfo.Arguments = $"\"{source.FullName}\"" + (_extractImages ? " --keep-data-uris" : string.Empty);
#endif

        string outputContent;
        using (Process process = new() { StartInfo = startInfo })
        {
            _ = process.Start();

            // Disposing a Process does not terminate it, so a cancelled read would otherwise leave
            // the child running. Killing it also closes its stdout, which unblocks the token-less
            // ReadToEndAsync/WaitForExit calls used on targets without cancellable overloads.
            using (cancellationToken.Register(() => TryKill(process)))
            {
#if NET
                outputContent = await process.StandardOutput.ReadToEndAsync(cancellationToken).ConfigureAwait(false);
                await process.WaitForExitAsync(cancellationToken).ConfigureAwait(false);
#else
                outputContent = await process.StandardOutput.ReadToEndAsync().ConfigureAwait(false);
                process.WaitForExit();
#endif
            }

            // A killed process reports a failure exit code; report the cancellation rather than
            // a misleading "process failed" error.
            cancellationToken.ThrowIfCancellationRequested();

            if (process.ExitCode != 0)
            {
                throw new InvalidOperationException($"MarkItDown process failed with exit code {process.ExitCode}.");
            }
        }

        return MarkdownParser.Parse(outputContent, identifier);
    }

    /// <inheritdoc/>
    /// <remarks>The contents of <paramref name="source"/> are copied to a temporary file, which is deleted before this method returns or throws.</remarks>
    public override async Task<IngestionDocument> ReadAsync(Stream source, string identifier, string mediaType, CancellationToken cancellationToken = default)
    {
        _ = Throw.IfNull(source);
        _ = Throw.IfNullOrEmpty(identifier);

        // Instead of creating a temporary file, we could write to the StandardInput of the process.
        // MarkItDown is documented to support reading from stdin, but that did not work when tried;
        // even the documented sample ("cat example.pdf | markitdown") failed.
        // Until that is understood, the content goes through a temporary file.
        string inputFilePath = Path.Combine(Path.GetTempPath(), Path.GetRandomFileName());

        // Created outside the try block: if CreateNew fails, no file of ours exists and nothing must be deleted.
        // bufferSize: 1 turns off FileStream's own buffering; CopyToAsync already writes in large chunks.
        FileStream inputFile = new(inputFilePath, FileMode.CreateNew, FileAccess.Write, FileShare.None, bufferSize: 1, FileOptions.Asynchronous);

        try
        {
            // From here on the file exists, so every exit path (including a failed or cancelled copy)
            // must go through the finally block below.
            using (inputFile)
            {
                await source
#if NET
                    .CopyToAsync(inputFile, cancellationToken)
#else
                    .CopyToAsync(inputFile, CopyBufferSize, cancellationToken)
#endif
                    .ConfigureAwait(false);
            }

            return await ReadAsync(new FileInfo(inputFilePath), identifier, mediaType, cancellationToken).ConfigureAwait(false);
        }
        finally
        {
            File.Delete(inputFilePath);
        }
    }

    // Best-effort termination of the child process; used as a cancellation callback, so it must not throw.
    private static void TryKill(Process process)
    {
        try
        {
#if NET
            // The executable may be a launcher that spawns the actual interpreter, so take the whole tree down.
            process.Kill(entireProcessTree: true);
#else
            process.Kill();
#endif
        }
        catch (Exception ex) when (ex is InvalidOperationException or System.ComponentModel.Win32Exception or AggregateException)
        {
            // The process has already exited or could not be terminated; there is nothing more to do.
        }
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
<Project Sdk="Microsoft.NET.Sdk">

<PropertyGroup>
<!-- Build for netstandard2.0 in addition to the target frameworks inherited from the repo-wide settings. -->
<TargetFrameworks>$(TargetFrameworks);netstandard2.0</TargetFrameworks>
<RootNamespace>Microsoft.Extensions.DataIngestion</RootNamespace>

<!-- Markdig is not strong-name signed, so this assembly is left unsigned as well. -->
<SignAssembly>false</SignAssembly>

<!-- Not ready to publish yet: no package is produced and package validation is turned off. -->
<IsPackable>false</IsPackable>
<Stage>preview</Stage>
<EnablePackageValidation>false</EnablePackageValidation>
</PropertyGroup>

<!-- MarkdownParser.cs is compiled into this project as a linked source file from the sibling Markdown project folder. -->
<ItemGroup>
<Compile Include="..\Microsoft.Extensions.DataIngestion.Markdown\MarkdownParser.cs" Link="MarkdownParser.cs" />
</ItemGroup>

<ItemGroup>
<ProjectReference Include="..\Microsoft.Extensions.DataIngestion.Abstractions\Microsoft.Extensions.DataIngestion.Abstractions.csproj" />
</ItemGroup>

<!-- No Version attribute here; presumably supplied by central package management (eng/packages/General.props) - confirm. -->
<ItemGroup>
<PackageReference Include="Markdig" />
</ItemGroup>

</Project>
Loading
Loading